From f1f70dda9884f5e5bbc16206f5bcdc36be3ed4bc Mon Sep 17 00:00:00 2001
From: Emma Schymanski <massbank@eawag.ch>
Date: Fri, 8 Mar 2013 10:48:57 +0000
Subject: [PATCH] Added option to switch multiplicity filtering on/off; more
 informative progress bars

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/RMassBank@74064 bc3139a8-67e5-0310-9ffc-ced21a209358
---
 DESCRIPTION          |  2 +-
 R/createMassBank.R   | 17 +++++++++--------
 R/leMsMs.r           | 39 ++++++++++++++++++++++++++-------------
 R/settings_example.R |  7 ++++++-
 inst/RMB_options.ini |  5 +++++
 5 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 61228a4..63a11f8 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: RMassBank
 Type: Package
 Title: Workflow to process tandem MS files and build MassBank records
-Version: 1.1.0
+Version: 1.1.1
 Author: Michael Stravs, Emma Schymanski
 Maintainer: Michael Stravs, Emma Schymanski <massbank@eawag.ch>
 Description: Workflow to process tandem MS files and build MassBank records.
diff --git a/R/createMassBank.R b/R/createMassBank.R
index bb85c1b..44f8e95 100755
--- a/R/createMassBank.R
+++ b/R/createMassBank.R
@@ -1,5 +1,6 @@
 # Script for writing MassBank files
 
+#testtest change
 #' Load MassBank compound information lists
 #' 
 #' Loads MassBank compound information lists (i.e. the lists which were created
@@ -164,7 +165,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c
   {
       mbdata_ids <- lapply(mb@aggregatedRcSpecs$specFound, function(spec) spec$id)
 	  
-	  message("mbWorkflow: Step 1")
+	  message("mbWorkflow: Step 1. Gather info from CTS")
 	  
       # Which IDs are not in mbdata_archive yet?
       new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id)
@@ -181,7 +182,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c
   # Otherwise, continue!
   if(2 %in% steps)
   {
-	message("mbWorkflow: Step 2")
+	message("mbWorkflow: Step 2. Export infolist (if required)")
     if(length(mb@mbdata)>0)
     {
       mbdata_mat <- flatten(mb@mbdata)
@@ -196,7 +197,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c
   # Step 3: Take the archive data (in table format) and reformat it to MassBank tree format.
   if(3 %in% steps)
   {
-	message("mbWorkflow: Step 3")
+	message("mbWorkflow: Step 3. Data reformatting")
     mb@mbdata_relisted <- apply(mb@mbdata_archive, 1, readMbdata)
   }
   # Step 4: Compile the spectra! Using the skeletons from the archive data, create
@@ -204,7 +205,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c
   # Also, assign accession numbers based on scan mode and relative scan no.
   if(4 %in% steps)
   {
-	  message("mbWorkflow: Step 4")
+	  message("mbWorkflow: Step 4. Spectra compilation")
 	  mb@compiled <- mapply(
 			  function(r, refiltered) {
 				  message(paste("Compiling: ", r$name, sep=""))
@@ -228,7 +229,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c
   # flat-text string arrays (basically, into text-file style, but still in memory)
   if(5 %in% steps)
   {
-	message("mbWorkflow: Step 5")
+	message("mbWorkflow: Step 5. Flattening records")
     mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank))
   }
   # Step 6: For all OK records, generate a corresponding molfile with the structure
@@ -236,14 +237,14 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c
   # is still in memory only, not yet a physical file)
   if(6 %in% steps)
   {
-	message("mbWorkflow: Step 6")
+	message("mbWorkflow: Step 6. Generate molfiles")
     mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(c[[1]][["CH$SMILES"]]))
   }
   # Step 7: If necessary, generate the appropriate subdirectories, and actually write
   # the files to disk.
   if(7 %in% steps)
   {
-	message("mbWorkflow: Step 7")
+	message("mbWorkflow: Step 7. Generate subdirs and export")
     dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "moldata", sep='/'),recursive=TRUE)
     dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "recdata", sep='/'),recursive=TRUE)
     for(cnt in 1:length(mb@compiled_ok))
@@ -253,7 +254,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c
   # to attribute substances to their corresponding structure molfiles.
   if(8 %in% steps)
   {
-	message("mbWorkflow: Step 8")
+	message("mbWorkflow: Step 8. Create list.tsv")
     makeMollist(mb@compiled_ok)
   }
   return(mb)
diff --git a/R/leMsMs.r b/R/leMsMs.r
index ae518ab..b482618 100755
--- a/R/leMsMs.r
+++ b/R/leMsMs.r
@@ -69,7 +69,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca
   if(1 %in% steps)
   {
 	nProg <- 0
-	message("msmsWorkflow: Step 1")
+	message("msmsWorkflow: Step 1. Acquire all MSMS spectra from files")
 	pb <- txtProgressBar(0,nLen,0, style=3, file=stderr())
     w@specs <- lapply(w@files, function(fileName) {
 		
@@ -97,7 +97,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca
   if(2 %in% steps)
   {
 	  nProg <- 0
-	  message("msmsWorkflow: Step 2")
+	  message("msmsWorkflow: Step 2. First analysis pre recalibration")
 	  pb <- txtProgressBar(0,nLen,0, style=3, file=stderr())
 	  w@analyzedSpecs <- lapply(w@specs, function(spec) {
 				  #print(spec$id)
@@ -115,13 +115,13 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca
   # Step 3: aggregate all spectra
   if(3 %in% steps)
   {
-	message("msmsWorkflow: Step 3")
+	message("msmsWorkflow: Step 3. Aggregate all spectra")
     w@aggregatedSpecs <- aggregateSpectra(w@analyzedSpecs)
   }
   # Step 4: recalibrate all m/z values in raw spectra
   if(4 %in% steps)
   {
-	message("msmsWorkflow: Step 4")
+	message("msmsWorkflow: Step 4. Recalibrate m/z values in raw spectra")
 	if(newRecalibration)
 	{
 		recal <- makeRecalibration(w@aggregatedSpecs, mode)
@@ -134,7 +134,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca
   if(5 %in% steps)
   {
 	nProg <- 0
-	message("msmsWorkflow: Step 5")
+	message("msmsWorkflow: Step 5. Reanalyze recalibrated spectra")
 	pb <- txtProgressBar(0,nLen,0, style=3, file=stderr())
     w@analyzedRcSpecs <- lapply(w@recalibratedSpecs, function(spec) {
       s <- analyzeMsMs(spec, mode=mode, detail=TRUE, run="recalibrated", cut=0, cut_ratio=0 )
@@ -152,7 +152,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca
   # Step 6: aggregate recalibrated results
   if(6 %in% steps)
   {
-    message("msmsWorkflow: Step 6")
+    message("msmsWorkflow: Step 6. Aggregate recalibrated results")
     w@aggregatedRcSpecs <- aggregateSpectra(w@analyzedRcSpecs, addIncomplete=TRUE)
     if(!is.na(archivename))
       archiveResults(w, paste(archivename, ".RData", sep=''))
@@ -162,7 +162,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca
   # Step 7: reanalyze failpeaks for (mono)oxidation and N2 adduct peaks
   if(7 %in% steps)
   {
-	message("msmsWorkflow: Step 7")
+	message("msmsWorkflow: Step 7. Reanalyze fail peaks for N2 + O")
     w@reanalyzedRcSpecs <- reanalyzeFailpeaks(
 			w@aggregatedRcSpecs, custom_additions="N2O", mode=mode)
     if(!is.na(archivename))
@@ -172,7 +172,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca
   #         creation of failpeak list
   if(8 %in% steps)
   {
-	message("msmsWorkflow: Step 8")
+	message("msmsWorkflow: Step 8. Peak multiplicity filtering")
     # apply heuristic filter
     w@refilteredRcSpecs <- filterMultiplicity(
 			w@reanalyzedRcSpecs, archivename, mode)
@@ -1428,6 +1428,13 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE)
 #' @export
 filterMultiplicity <- function(specs, archivename=NA, mode="pH", recalcBest = TRUE)
 {
+    # Read multiplicity filter setting
+    # For backwards compatibility: If the option is not set, define as 2
+    # (which was the behaviour before we introduced the option)
+    multiplicityFilter <- getOption("RMassBank")$multiplicityFilter
+    if(is.null(multiplicityFilter))
+      multiplicityFilter <- 2
+    
     peaksFiltered <- filterPeaksMultiplicity(specs$peaksMatched,
                                                         "formula", recalcBest)
     peaksFilteredReanalysis <- 
@@ -1437,18 +1444,24 @@ filterMultiplicity <- function(specs, archivename=NA, mode="pH", recalcBest = TR
     peaksNoformula$formulaMultiplicity <- 0
     peaksNoformula$fM_factor <- as.factor(0)
     
-    
+	# Reorder the columns of peaksNoformula such that they match the columns
+	# of peaksFilteredReanalysis; such that rbind gives an identical result
+	# when peaksFilteredReanalysis is empty. (Otherwise peaksFilterReanalysis
+	# would be dropped as 0x0, and rbind's output column order would be the one originally
+	# in peaksNoformula. See ?cbind)
+	peaksNoformula <- peaksNoformula[,colnames(peaksFilteredReanalysis)]
+	
     # export the peaks which drop through reanalysis or filter criteria
     fp_rean <-  problematicPeaks(
                 rbind(
                   peaksFilteredReanalysis[ 
-						  peaksFilteredReanalysis$formulaMultiplicity < 2,],
+						  peaksFilteredReanalysis$formulaMultiplicity < multiplicityFilter,],
                   peaksNoformula),
                 specs$peaksMatched,
                 mode)
     fp_mult <-  problematicPeaks(
                 peaksFiltered[
-						peaksFiltered$formulaMultiplicity < 2,],
+						peaksFiltered$formulaMultiplicity < multiplicityFilter,],
                 specs$peaksMatched,
                 mode)
     fp_mult$good <- NULL
@@ -1479,9 +1492,9 @@ filterMultiplicity <- function(specs, archivename=NA, mode="pH", recalcBest = TR
                  "formulaCount", "parentScan", "aMax", "mzCenter")]
     
     peaksOK <- peaksFiltered[
-                      peaksFiltered$formulaMultiplicity > 1,]
+                      peaksFiltered$formulaMultiplicity > (multiplicityFilter - 1),]
     peaksReanOK <- peaksFilteredReanalysis[
-						(peaksFilteredReanalysis$formulaMultiplicity > 1) &
+						(peaksFilteredReanalysis$formulaMultiplicity > (multiplicityFilter - 1)) &
 						!is.na(peaksFilteredReanalysis$reanalyzed.formula),]
     # Kick the M+H+ satellites out of peaksReanOK:
     peaksReanOK$mzCenter <- as.numeric(
diff --git a/R/settings_example.R b/R/settings_example.R
index 49c51b2..a709439 100755
--- a/R/settings_example.R
+++ b/R/settings_example.R
@@ -182,7 +182,12 @@ NULL
   # The settings define which recal function is used
   recalibrator = list(
 	MS1 = "recalibrate.loess",
-	MS2 = "recalibrate.loess")
+	MS2 = "recalibrate.loess"),
+  # Define the multiplicity filtering level
+  # Default is 2 (peak occurs at least twice)
+  # Set this to 1 if you want to turn this option off.
+  # Set this to anything > 2 if you want harder filtering
+  multiplicityFilter = 2
   )
 
 # Writes a file with sample settings which the user can adjust with his values.
diff --git a/inst/RMB_options.ini b/inst/RMB_options.ini
index a29f6e3..e19f638 100755
--- a/inst/RMB_options.ini
+++ b/inst/RMB_options.ini
@@ -168,3 +168,8 @@ recalibrator:
     MS1: recalibrate.loess
     MS2: recalibrate.loess
 
+# Define the multiplicity filtering level
+# Default is 2 (peak occurs at least twice)
+# Set this to 1 if you want to turn this option off.
+# Set this to anything > 2 if you want harder filtering
+multiplicityFilter: 2
-- 
GitLab