RMassBank: updates from git repo

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/RMassBank@81148 bc3139a8-67e5-0310-9ffc-ced21a209358

RMassBank: updates from git repo
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/RMassBank@81148 bc3139a8-67e5-0310-9ffc-ced21a209358
3327b6be · Emma Schymanski · fd3f8d6b · 3327b6be · 3327b6be · 3327b6be
Commit 3327b6be authored 11 years ago by Emma Schymanski
--- a/DESCRIPTION
+++ b/DESCRIPTION
 Package: RMassBank
 Type: Package
 Title: Workflow to process tandem MS files and build MassBank records
-Version: 1.3.1
+Version: 1.3.2
 Authors@R: c(
    person(given = "RMassBank at Eawag", email = "massbank@eawag.ch",
    role=c("cre")),
@@ -9,11 +9,12 @@ Authors@R: c(
    "michael.stravs@eawag.ch", role=c("aut")), person(given = "Emma L.",
    family = "Schymanski", email = "emma.schymanski@eawag.ch", role=c("aut")),
    person(given = "Steffen", family = "Neumann", role = "aut", email =
-    "sneumann@ipb-halle.de"), person(given = "Erik", family = "Mueller", role =
+    "sneumann@ipb-halle.de"), person(given = "Erik", family = "Muller", role =
    "aut", email = "erik.mueller@student.uni-halle.de"), person(given =
    "Tobias", family = "Schulze", role = "ctb", email =
    "tobias.schulze@ufz.de") )
-Author: Michael Stravs, Emma Schymanski, Steffen Neumann, Erik Mueller, with contributions from Tobias Schulze
+Author: Michael Stravs, Emma Schymanski, Steffen Neumann, Erik Mueller, with
+    contributions from Tobias Schulze
 Maintainer: RMassBank at Eawag <massbank@eawag.ch>
 Description: Workflow to process tandem MS files and build MassBank records.
    Functions include automated extraction of tandem MS spectra, formula
@@ -24,11 +25,15 @@ License: Artistic-2.0
 SystemRequirements: OpenBabel
 biocViews: Bioinformatics, MassSpectrometry, Metabolomics, Software
 Depends:
-    rcdk,yaml,mzR,methods,rjson
+    mzR,rcdk,yaml,methods
 Imports:
-    XML,RCurl
+    XML,RCurl,rjson
 Suggests:
-    gplots,RMassBankData,xcms
+    gplots,RMassBankData,
+    xcms (>= 1.37.1),
+    CAMERA,
+    ontoCAT,
+    RUnit
 Collate:
    'createMassBank.R'
    'formulaCalculator.R'
@@ -38,5 +43,8 @@ Collate:
    'settings_example.R'
    'webAccess.R'
    'deprofile.R'
+    'parseMassBank.R'
    'RmbWorkspace.R'
+    'validateMassBank.R'
+    'zzz.R'
    'tools.R'
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,9 +3,14 @@ export(CTS.externalIdTypes)
 export(RmbDefaultSettings)
 export(RmbSettingsTemplate)
 export(add.formula)
+export(addMB)
 export(addPeaks)
+export(addPeaksManually)
 export(aggregateSpectra)
 export(analyzeMsMs)
+export(analyzeMsMs.formula)
+export(analyzeMsMs.intensity)
+export(annotator.default)
 export(archiveResults)
 export(cleanElnoise)
 export(combineMultiplicities)
@@ -31,6 +36,7 @@ export(findMsMsHR.mass)
 export(findMz)
 export(findMz.formula)
 export(findName)
+export(findProgress)
 export(findRt)
 export(findSmiles)
 export(flatten)
@@ -59,26 +65,38 @@ export(multiply.formula)
 export(newMbWorkspace)
 export(newMsmsWorkspace)
 export(order.formula)
+export(parseMassBank)
+export(plotRecalibration)
+export(plotRecalibration.direct)
 export(ppm)
 export(problematicPeaks)
+export(progressBarHook)
 export(readMbdata)
 export(reanalyzeFailpeak)
 export(reanalyzeFailpeaks)
 export(recalibrate)
 export(recalibrate.addMS1data)
+export(recalibrate.identity)
+export(recalibrate.linear)
 export(recalibrate.loess)
+export(recalibrate.mean)
 export(recalibrateSingleSpec)
 export(recalibrateSpectra)
 export(resetInfolists)
 export(resetList)
+export(smiles2mass)
 export(to.limits.rcdk)
 export(toMassbank)
+export(validate)
 exportClasses(mbWorkspace)
 exportClasses(msmsWorkspace)
 exportMethods(show)
 import(RCurl)
 import(XML)
+import(methods)
 import(mzR)
+import(rcdk)
 import(rjson)
+import(yaml)
 importClassesFrom(mzR)
 importMethodsFrom(mzR)
--- a/R/RmbWorkspace.R
+++ b/R/RmbWorkspace.R
-
+#' @import methods
+NULL

 #' Workspace for \code{msmsWorkflow} data
 #' 

--- a/R/createMassBank.R
+++ b/R/createMassBank.R
@@ -453,7 +453,7 @@ gatherData <- function(id)
 	}
 	else
 	{
-		iupacName <-infos$synonyms[[ipreferred]][["name"]]
+		iupacName <-infos$synonyms[[ipreferred[[1]]]][["name"]]
 	}
 	# Eliminate duplicate names from our list of 3
 	names <- as.list(unique(c(dbname, iupacName)))
@@ -916,9 +916,6 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, refiltered, additionalP
  peaks$dppm <- round(peaks$dppm, 2)
  peaks$mzCalc <- round(peaks$mzCalc, 4)
  peaks$int <- round(peaks$int, 1)
-  # I actually have no idea what the "num" entry in the annotation data
-  # is supposed to mean.
-  peaks$num <- 1
  # copy the peak table to the annotation table. (The peak table will then be extended
  # with peaks from the global "additional_peaks" table, which can be used to add peaks
  # to the spectra by hand.
@@ -971,7 +968,12 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, refiltered, additionalP
    "pM" = "+",
    "mM" = "-")
  type <- formula_tag[[spec$mode]]
-  annotation$formula <- paste(annotation$formula, type, sep='')
+  
+  annotator <- getOption("RMassBank")$annotator
+  if(is.null(annotator))
+    annotator <- "annotator.default"
+  
+  
  
  # Here, the relative intensity is recalculated using the newly added additional
  # peaks from the peak list. Therefore, we throw superfluous peaks out again.
@@ -980,9 +982,7 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, refiltered, additionalP
  annotation$intrel <- floor(annotation$int / max(peaks$int) * 999)
  annotation <- annotation[annotation$intrel >= 1,]
  
-  # Select the right columns and name them correctly for output.
-  annotation <- annotation[,c("mzSpec", "num", "formula", "mzCalc", "dppm")]
-  colnames(annotation) <- c("m/z", "num", "{formula", "mass", "error(ppm)}")
+  annotation <- do.call(annotator, list(annotation= annotation, type=type))
  
  # Create the "lower part" of the record.  
  mbdata <- list()
@@ -1002,9 +1002,17 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, refiltered, additionalP
  mbdata[["MS$FOCUSED_ION"]] <- ms_fi

  # the data processing tag :)
+  # Change by Tobias:
+  # I suggest to add here the current version number of the clone due to better distinction between different makes of MB records
+  # Could be automatised from DESCRIPTION file?
+  if(getOption("RMassBank")$use_rean_peaks)
+      processingComment <- list("REANALYZE" = "Peaks with additional N2/O included")
+  else
+      processingComment <- list()
  mbdata[["MS$DATA_PROCESSING"]] <- c(
    getOption("RMassBank")$annotations$ms_dataprocessing,
-    list("WHOLE" = "RMassBank")
+    processingComment,
+    list("WHOLE" = paste("RMassBank", packageVersion("RMassBank")))
    )
  
  # Annotation:
@@ -1016,7 +1024,20 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, refiltered, additionalP
  # These two entries will be thrown out later, but they are necessary to build the
  # record title and the accession number.
  mbdata[["RECORD_TITLE_CE"]] <- msmsdata$info$ces #formatted collision energy
-  mbdata[["SUBSCAN"]] <- msmsdata$scan - spec$parentHeader$acquisitionNum #relative scan
+  # Mode of relative scan calculation: by default it is calculated relative to the
+  # parent scan. If a corresponding option is set, it will be calculated from the first
+  # present child scan in the list.
+  relativeScan <- "fromParent"
+  if(!is.null(getOption("RMassBank")$recomputeRelativeScan))
+	  if(getOption("RMassBank")$recomputeRelativeScan == "fromFirstChild")
+		  relativeScan <- "fromFirstChild"
+  if(relativeScan == "fromParent")
+	  mbdata[["SUBSCAN"]] <- msmsdata$scan - spec$parentHeader$acquisitionNum #relative scan
+  else if(relativeScan == "fromFirstChild")
+  {
+	  firstChild <- min(unlist(lapply(spec,function(d) d$header$acquisitionNum)))
+	  mbdata[["SUBSCAN"]] <- msmsdata$scan - firstChild + 1
+  }
  return(mbdata)
 }

@@ -1092,32 +1113,16 @@ compileRecord <- function(spec, mbdata, refiltered, additionalPeaks = NULL)
      # Here is the right place to fix the name of the INTERNAL ID field.
      names(mbrecord[["COMMENT"]])[[which(names(mbrecord[["COMMENT"]]) == "ID")]] <-
        getOption("RMassBank")$annotations$internal_id_fieldname
-      # The fields are named differently in MB record definitions v.1 and 2.
-      # Therefore, the title is composed slightly differently (with the same result.)
+	  # get mode parameter (for accession number generation) depending on version 
+	  # of record definition
+	  # Change by Tobias:
+	  # I suggest to include fragmentation mode here for information
      if(getOption("RMassBank")$use_version == 2)
-      {
-        mbrecord[["RECORD_TITLE"]] <- paste(
-          mbrecord[["CH$NAME"]][[1]],
-          mbrecord[["AC$INSTRUMENT_TYPE"]],
-          mbrecord[["AC$MASS_SPECTROMETRY"]][["MS_TYPE"]],
-          mbrecord[["RECORD_TITLE_CE"]],
-          paste("R=",mbrecord[["AC$MASS_SPECTROMETRY"]][["RESOLUTION"]], sep='' ),
-          mbrecord[["MS$FOCUSED_ION"]][["PRECURSOR_TYPE"]],
-          sep="; ")
        mode <- mbrecord[["AC$MASS_SPECTROMETRY"]][["ION_MODE"]]
-      }
      else
-      {
-        mbrecord[["RECORD_TITLE"]] <- paste(
-          mbrecord[["CH$NAME"]][[1]],
-          mbrecord[["AC$INSTRUMENT_TYPE"]],
-          mbrecord[["AC$ANALYTICAL_CONDITION"]][["MS_TYPE"]],
-          paste("CE: ", mbrecord[["RECORD_TITLE_CE"]], sep=''),
-          paste("R=",mbrecord[["AC$ANALYTICAL_CONDITION"]][["RESOLUTION"]], sep='' ),
-          mbrecord[["MS$FOCUSED_ION"]][["PRECURSOR_TYPE"]],
-          sep="; ")
        mode <- mbrecord[["AC$ANALYTICAL_CONDITION"]][["MODE"]]
-      }
+	  # Generate the title and then delete the temporary RECORD_TITLE_CE field used before
+	  mbrecord[["RECORD_TITLE"]] <- .parseTitleString(mbrecord)
      mbrecord[["RECORD_TITLE_CE"]] <- NULL
      # Calculate the accession number from the options.
      shift <- getOption("RMassBank")$accessionNumberShifts[[spec$mode]]
@@ -1129,6 +1134,137 @@ compileRecord <- function(spec, mbdata, refiltered, additionalPeaks = NULL)
  })
 }

+
+
+#' Generate peak annotation from peaklist
+#' 
+#' Generates the PK$ANNOTATION entry from the peaklist obtained. This function is
+#' overridable by using the "annotator" option in the settings file.
+#' 
+#' @param annotation A peak list to be annotated. Contains columns:
+#' \code{"cpdID","formula","mzFound" ,"scan","mzCalc","dppm",
+#'      "dbe","mz","int","formulaCount","parentScan","fM_factor","dppmBest",
+#'     "formulaMultiplicity","intrel","mzSpec"}
+#' 
+#' @param type The ion type to be added to annotated formulas ("+" or "-" usually)
+#' 
+#' @return The annotated peak table. Table \code{colnames()} will be used for the
+#' 		titles (preferably don't use spaces in the column titles; however, no format is
+#' 		strictly enforced by the MassBank data format).
+#' 
+#' @examples 
+#' \dontrun{
+#' annotation <- annotator.default(annotation, type = "+")
+#' }
+#' @author Michele Stravs, Eawag <stravsmi@@eawag.ch>
+#' @export
+annotator.default <- function(annotation, type)
+{
+  
+  annotation$formula <- paste(annotation$formula, type, sep='')
+  # Select the right columns and name them correctly for output.
+  annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")]
+  colnames(annotation) <- c("m/z", "tentative_formula", "formula_count", "mass", "error(ppm)")
+  return(annotation)
+}
+
+#' Parse record title
+#' 
+#' Parses a title for a single MassBank record using the title format
+#' specified in the option titleFormat. Internally used, not exported.
+#' 
+#' If the option is not set, a standard title format is used (for record definition
+#' version 1 or 2).
+#' 
+#' @usage .parseTitleString(mbrecord)
+#' @param mbrecord A MassBank record in list format, as returned from
+#' 	\code{\link{gatherSpectrum}}.
+#' @return A string with the title.
+#' @author Michael Stravs, Eawag
+#' @seealso \code{\link{compileRecord}}
+#' @references MassBank record format:
+#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
+#' @examples
+#' \dontrun{
+#' 		# used in compileRecord()
+#' 		title <- .parseTitleString(mbrecord)
+#' }
+#' 
+#' 
+#' 
+.parseTitleString <- function(mbrecord)
+{
+	
+	varlist <- getOption("RMassBank")$titleFormat
+	
+	# Set the standard title format.
+	if(is.null(varlist))
+	{
+		if(getOption("RMassBank")$use_version == 2)
+		{
+			varlist <- c(
+					"{CH$NAME}",
+					"{AC$INSTRUMENT_TYPE}",
+					"{AC$MASS_SPECTROMETRY: MS_TYPE}",
+					"CE: {RECORD_TITLE_CE}",
+					"R={AC$MASS_SPECTROMETRY: RESOLUTION}",
+					"{MS$FOCUSED_ION: PRECURSOR_TYPE}"
+			)
+		}
+		else
+		{
+			varlist <- c(
+					"{CH$NAME}",
+					"{AC$INSTRUMENT_TYPE}",
+					"{AC$ANALYTICAL_CONDITION: MS_TYPE}",
+					"CE: {RECORD_TITLE_CE}",
+					"R={AC$ANALYTICAL_CONDITION: RESOLUTION}",
+					"{MS$FOCUSED_ION: PRECURSOR_TYPE}"
+			)
+		}
+	}
+  
+	
+	# Extract a {XXX} argument from each title section.
+	# check that every title has one and only one match
+	args <- regexec("\\{(.*)\\}", varlist)
+	arglist <- regmatches(varlist, args)
+	if(any(unlist(lapply(arglist, length)) != 2))
+		stop("Title format is incorrectly specified: a section with not exactly 1 parameters")
+	
+	parsedVars <- lapply(varlist, function(var)
+			{
+				# Extract the specified parameter inside the {}.
+				# I.e. from a string like "R={BLA: BLUB}" return "BLA: BLUB"
+				args <- regexec("\\{(.*)\\}", var)
+				arg <- regmatches(var, args)[[1]][[2]]
+				
+				# Split the parameter by colon if necessary
+				splitVar <- strsplit(arg, ": ")[[1]]
+				# Read the parameter value from the record
+				if(length(splitVar) == 2)
+					replaceVar <- mbrecord[[splitVar[[1]]]][[splitVar[[2]]]]
+				else if(length(splitVar) ==  1)
+					replaceVar <- mbrecord[[splitVar]]
+				else
+					stop(paste(
+									"Title format is incorrectly specified:", var)
+					)
+				# Fix problems: NULL returns
+				if(is.null(replaceVar))
+					replaceVar <- ""
+				# Fix problems: Names will have >= 1 match. Take the first
+				if(length(replaceVar) > 1)
+					replaceVar <- replaceVar[[1]]
+				# Substitute the parameter value into the string
+				parsedVar <- sub("\\{(.*)\\}", replaceVar, var)	
+				return(parsedVar)
+			})
+	title <- paste(parsedVars, collapse="; ")
+	return(title)
+}
+
+
 # This converts the tree-like list (as obtained e.g. from compileRecord())
 # into a plain text array, which can then be dumped to a file suitable for 
 # MassBank upload.

--- a/R/formulaCalculator.R
+++ b/R/formulaCalculator.R
-
+#' @import rcdk
+NULL




--- a/R/leCsvAccess.R
+++ b/R/leCsvAccess.R
@@ -265,5 +265,4 @@ findMass <- function(cpdID_or_smiles)
 		s <- findSmiles(cpdID_or_smiles)
 	mol <- getMolecule(s)
 	return(get.exact.mass(mol))
-}
-
+}
\ No newline at end of file
--- a/R/leMsMs.r
+++ b/R/leMsMs.r
--- a/R/leMsmsRaw.R
+++ b/R/leMsmsRaw.R
@@ -9,15 +9,30 @@ NULL
 #' Extracts MS/MS spectra from LC-MS raw data for a specified precursor, specified
 #' either via the RMassBank compound list (see \code{\link{loadList}}) or via a mass.
 #' 
-#' Different versions of the function get the data from different sources.
-#' 
-#' @usage findMsMsHR(fileName, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE, dppm=10)
+#' Different versions of the function get the data from different sources. Note that 
+#' 		findMsMsHR and findMsMsHR.direct differ mainly in that findMsMsHR opens a file
+#' 		whereas findMsMsHR.direct uses an open file handle - both are intended to be used
+#' 		in a full process which involves compound lists etc. In contrast, findMsMsHR.mass
+#' 		is a low-level function which uses the mass directly for lookup and is intended for
+#' 		use as a standalone function in unrelated applications.
 #' 
+#' @usage findMsMsHR(fileName, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE,
+#' 		ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine,
+#' 		mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse,
+#' 		fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan,
+#' 		rtMargin = getOption("RMassBank")$rtMargin,
+#' 		deprofile = getOption("RMassBank")$deprofile)
+#' 		
 #' 		findMsMsHR.mass(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA,
-#' 		headerCache = NA)
-#' 
-#' 		findMsMsHR.direct(msRaw, cpdID, mode = "pH", confirmMode = 0,
-#'  	useRtLimit = TRUE, dppm=10, limit.coarse=0.5)
+#' 		headerCache = NA, fillPrecursorScan = FALSE,
+#' 		deprofile = getOption("RMassBank")$deprofile)
+#'
+#' findMsMsHR.direct(msRaw, cpdID, mode = "pH", confirmMode = 0, useRtLimit = TRUE, 
+#'			ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine,
+#'			mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse,
+#'			fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan,
+#'			rtMargin = getOption("RMassBank")$rtMargin,
+#'			deprofile = getOption("RMassBank")$deprofile, headerCache = NA)
 #' 
 #' @aliases findMsMsHR.mass findMsMsHR.direct findMsMsHR
 #' @param fileName The file to open and search the MS2 spectrum in.
@@ -25,13 +40,16 @@ NULL
 #' @param cpdID The compound ID in the compound list (see \code{\link{loadList}})
 #' 			to use for formula lookup.
 #' @param mz The mass to use for spectrum search.
-#' @param dppm The limit in ppm to use for fine limit (see below) calculation.
-#' @param limit.coarse The coarse limit to use for locating potential MS2 scans:
+#' @param ppmFine The limit in ppm to use for fine limit (see below) calculation.
+#' @param mzCoarse The coarse limit to use for locating potential MS2 scans:
 #'			this tolerance is used when finding scans with a suitable precursor
 #' 			ion value.  
 #' @param limit.fine The fine limit to use for locating MS2 scans: this tolerance
 #' 			is used when locating an appropriate analyte peak in the MS1 precursor
-#' 			spectrum.
+#' 			spectrum. 
+#' @param limit.coarse Parameter in \code{findMsMsHR.mass} corresponding to \code{mzCoarse}.
+#' 			(The parameters are distinct to clearly conceptually distinguish findMsMsHR.mass
+#' 			(a standalone useful function) from the cpdID based functions (workflow functions).)
 #' @param mode The processing mode (determines which ion/adduct is searched):
 #' 			\code{"pH", "pNa", "pM", "mH", "mM", "mFA"} for different ions 
 #' 			([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-). 
@@ -47,6 +65,11 @@ NULL
 #' @param maxCount The maximal number of spectra groups to return. One spectra group
 #' 			consists of all data-dependent scans from the same precursor whose precursor
 #' 			mass matches the specified search mass.
+#' @param fillPrecursorScan If \code{TRUE}, the precursor scan will be filled from MS1 data.
+#' 			To be used for data where the precursor scan is not stored in the raw data.
+#' @param rtMargin	The retention time tolerance to use.
+#' @param deprofile	Whether deprofiling should take place, and what method should be
+#' 			used (cf. \code{\link{deprofile}}) 
 #' @return	For \code{findMsMsHR} and \code{findMsMsHR.direct}: A "spectrum set", a list with items:
 #' 			\item{foundOK}{\code{TRUE} if a spectrum was found, \code{FALSE} otherwise.
 #' 				Note: if \code{FALSE}, all other values can be missing!}
@@ -76,36 +99,55 @@ NULL
 #' @author Michael A. Stravs, Eawag <michael.stravs@@eawag.ch>
 #' @seealso findEIC
 #' @export
-findMsMsHR <- function(fileName, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE, dppm=10)
+findMsMsHR <- function(fileName, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE,
+		ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine,
+		mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse,
+		fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan,
+		rtMargin = getOption("RMassBank")$rtMargin,
+		deprofile = getOption("RMassBank")$deprofile)
 {
 	
 	# access data directly for finding the MS/MS data. This is done using
 	# mzR.
 	msRaw <- openMSfile(fileName)
-	ret <- findMsMsHR.direct(msRaw, cpdID, mode, confirmMode, useRtLimit, dppm)
+	ret <- findMsMsHR.direct(msRaw, cpdID, mode, confirmMode, useRtLimit, ppmFine, mzCoarse, fillPrecursorScan,
+				rtMargin, deprofile)
 	mzR::close(msRaw)
 	return(ret)
 }

 #' @export
 findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA,
-		headerCache = NA)
+		headerCache = NA, fillPrecursorScan = FALSE,
+		deprofile = getOption("RMassBank")$deprofile)
 {
 	eic <- findEIC(msRaw, mz, limit.fine, rtLimits)
 	#	if(!is.na(rtLimits))
 	#	{  
 	#		eic <- subset(eic, rt >= rtLimits[[1]] & rt <= rtLimits[[2]])
 	#	}
-	if(!is.na(headerCache))
+	if(!all(is.na(headerCache)))
 		headerData <- headerCache
 	else
 		headerData <- as.data.frame(header(msRaw))
 	
+	if(fillPrecursorScan == TRUE)
+	{
+		# reset the precursor scan number. first set to NA, then
+		# carry forward the precursor scan number from the last parent scan
+		headerData$precursorScanNum <- NA
+		headerData[which(headerData$msLevel == 1),"precursorScanNum"] <-
+				headerData[which(headerData$msLevel == 1),"acquisitionNum"]
+		headerData[,"precursorScanNum"] <- .locf(headerData[,"precursorScanNum"])
+		# Clear the actual MS1 precursor scan number again
+		headerData[which(headerData$msLevel == 1),"precursorScanNum"] <- 0
+	}
+	
 	# Find MS2 spectra with precursors which are in the allowed 
 	# scan filter (coarse limit) range
 	findValidPrecursors <- headerData[
 			(headerData$precursorMZ > mz - limit.coarse) &
-			(headerData$precursorMZ < mz + limit.coarse),]
+					(headerData$precursorMZ < mz + limit.coarse),]
 	# Find the precursors for the found spectra
 	validPrecursors <- unique(findValidPrecursors$precursorScanNum)
 	# check whether the precursors are real: must be within fine limits!
@@ -138,17 +180,17 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA,
 			{
 				masterHeader <- headerData[headerData$acquisitionNum == masterScan,]
 				childHeaders <- headerData[(headerData$precursorScanNum == masterScan) 
-					& (headerData$precursorMZ > mz - limit.coarse) 
-					& (headerData$precursorMZ < mz + limit.coarse) ,]
+								& (headerData$precursorMZ > mz - limit.coarse) 
+								& (headerData$precursorMZ < mz + limit.coarse) ,]
 				childScans <- childHeaders$acquisitionNum
 				
 				msPeaks <- mzR::peaks(msRaw, masterHeader$seqNum)
 				# if deprofile option is set: run deprofiling
-				deprofile.setting <- getOption("RMassBank")$deprofile
+				deprofile.setting <- deprofile
 				if(!is.na(deprofile.setting))
 					msPeaks <- deprofile.scan(
 							msPeaks, method = deprofile.setting, noise = NA, colnames = FALSE
-							)
+					)
 				colnames(msPeaks) <- c("mz","int")
 				msmsPeaks <- lapply(childHeaders$seqNum, function(scan)
 						{
@@ -180,7 +222,13 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA,
 }

 #' @export
-findMsMsHR.direct <- function(msRaw, cpdID, mode = "pH", confirmMode = 0, useRtLimit = TRUE, dppm=10, limit.coarse=0.5)
+findMsMsHR.direct <- function(msRaw, cpdID, mode = "pH", confirmMode = 0, useRtLimit = TRUE, 
+			ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine,
+			mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse,
+			fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan,
+			rtMargin = getOption("RMassBank")$rtMargin,
+			deprofile = getOption("RMassBank")$deprofile,
+      headerCache = NA)
 {
  # for finding the peak RT: use the gauss-fitted centwave peak
  # (centroid data converted with TOPP is necessary. save as
@@ -190,21 +238,28 @@ findMsMsHR.direct <- function(msRaw, cpdID, mode = "pH", confirmMode = 0, useRtL
  # find cpd m/z
  mzLimits <- findMz(cpdID, mode)
  mz <- mzLimits$mzCenter
-  limit.fine <- ppm(mz, dppm, p=TRUE)
+  limit.fine <- ppm(mz, ppmFine, p=TRUE)
  if(!useRtLimit)
 	  rtLimits <- NA
  else
  {
-	  rtMargin <- getOption("RMassBank")$rtMargin
 	  dbRt <- findRt(cpdID)
 	  rtLimits <- c(dbRt$RT - rtMargin, dbRt$RT + rtMargin) * 60
  }
-  spectra <- findMsMsHR.mass(msRaw, mz, limit.coarse, limit.fine, rtLimits, confirmMode + 1)
-  spectra[[confirmMode + 1]]$mz <- mzLimits
-  return(spectra[[confirmMode + 1]])
+  spectra <- findMsMsHR.mass(msRaw, mz, mzCoarse, limit.fine, rtLimits, confirmMode + 1,headerCache
+  	,fillPrecursorScan, deprofile)
+  # check whether a) spectrum was found and b) enough spectra were found
+  if(length(spectra) < (confirmMode + 1))
+    sp <- list(foundOK = FALSE)
+  else
+    sp <- spectra[[confirmMode + 1]]
+  
+  sp$mz <- mzLimits
+  sp$id <- cpdID
+  sp$formula <- findFormula(cpdID)
+  return(sp)
 }

-
 # Finds the EIC for a mass trace with a window of x ppm.
 # (For ppm = 10, this is +5 / -5 ppm from the non-recalibrated mz.)
 #' Extract EICs 
@@ -248,3 +303,115 @@ findEIC <- function(msRaw, mz, limit = NULL, rtLimit = NA)
 	scan <- headerMS1$acquisitionNum
 	return(data.frame(rt = rt, intensity=pks_t, scan=scan))
 }
+
+
+#' Addition of manual peaklists
+#' 
+#' Adds a manual peaklist in matrix-format
+#'
+#' @usage addPeaksManually(w, cpdID, handSpec, mode)
+#' @param w The msmsWorkspace that the peaklist should be added to.
+#' @param cpdID The compoundID of the compound that has been used for the peaklist
+#' @param handSpec A peaklist with 2 columns (m/z and intensity, as in the example below).
+#' @param mode The ionization mode that has been used for the spectrum represented by the peaklist
+#' @return The \code{msmsWorkspace} with the additional peaklist added to the right spectrum
+#' @seealso \code{\link{msmsWorkflow}}
+#' @author Erik Mueller
+#' @examples \dontrun{
+#' 		handSpec <- matrix(0,4,2)
+#' 		handSpec[,1] <- c(274.986685367956, 259.012401087427, 95.9493025990907, 96.9573002472772)
+#' 		handSpec[,2] <- c(357,761, 2821, 3446)
+#' 		addPeaksManually(w, cpdID, handSpec)
+#' }
+#' @export
+addPeaksManually <- function(w, cpdID, handSpec, mode = "pH"){
+	childHeaderAddition <- t(sapply(handSpec, function(spec){
+			header <- vector()
+			header[1:3] <- 2
+			header[4] <- length(spec[,1])
+			header[5] <- 0 ##Does this matter?
+			header[6] <- findRt(cpdID)$RT * 60
+			header[7] <- spec[which.max(spec[,2]),1]
+			header[8] <- max(spec[,2])
+			header[9] <- 0 ##Does this matter?
+			header[10] <- 0 ##Does this matter?
+			header[11] <- min(spec[,1])
+			header[12] <- max(spec[,1])
+			header[13] <- 1
+			header[14] <- findMz(cpdID)[[3]]
+			header[15] <- -1 ##Will be changed for different charges
+			header[16] <- 0 ##There sadly isnt any precursor intensity to find in the msms-scans. Workaround? msmsXCMS@files[1]
+			header[17:20] <- 0 ##Will be changed if merge is wanted
+			return(header)
+		}))
+	##Set colnames and rownames
+	colnames(childHeaderAddition) <- c("seqNum", "acquisitionNum", "msLevel", "peaksCount", "totIonCurrent", "retentionTime", "basepeakMZ", 
+										"basePeakIntensity", "collisionEnergy", "ionisationEnergy", "lowMZ", "highMZ", "precursorScanNum",
+										"precursorMZ", "precursorCharge", "precursorIntensity", "mergedScan", "mergedResultScanNum", 
+										"mergedResultStartScanNum", "mergedResultEndScanNum")
+	##Convert the manual peaklists
+	peaksHand <- lapply (handSpec, function(specs){
+							peaks <- matrix(nrow = length(specs[,1]), ncol = 2)
+							colnames(peaks) <- c("mz","int")
+							peaks <- specs
+							return(peaks)
+						})
+	
+	##Where do the peaks and the header need to be added?
+	pos <- sapply(w@specs,function(spec){cpdID %in% spec$id})
+	##If the compound for the cpdID isn't in specs yet, add a new spectrum
+	if(length(pos) == 0){
+		pos <- length(w@specs) + 1
+		childHeaderAddition[,1:2] <- 1
+		w@specs[[pos]] <- list()
+		w@specs[[pos]]$foundOK <- 1
+		w@specs[[pos]]$parentscan <- 1
+		w@specs[[pos]]$parentHeader <- matrix(0, ncol = 20, nrow = 1)
+		rownames(w@specs[[pos]]$parentHeader) <- 1
+		colnames(w@specs[[pos]]$parentHeader) <- c("seqNum", "acquisitionNum", "msLevel", "peaksCount", "totIonCurrent", "retentionTime", "basepeakMZ", 
+									"basePeakIntensity", "collisionEnergy", "ionisationEnergy", "lowMZ", "highMZ", "precursorScanNum",
+									"precursorMZ", "precursorCharge", "precursorIntensity", "mergedScan", "mergedResultScanNum", 
+									"mergedResultStartScanNum", "mergedResultEndScanNum")
+		w@specs[[pos]]$parentHeader[1,1:3] <- 1
+		w@specs[[pos]]$parentHeader[1,4:20] <- 0
+		w@specs[[pos]]$parentHeader <- as.data.frame(w@specs[[pos]]$parentHeader)
+		w@specs[[pos]]$childScans <- 1
+		w@specs[[pos]]$childHeader <- as.data.frame(childHeaderAddition)
+		w@specs[[pos]]$parentPeak <- matrix(nrow = 1, ncol = 2)
+		colnames(w@specs[[pos]]$parentPeak) <- c("mz","int")
+		w@specs[[pos]]$parentPeak[1,] <- c(findMz(cpdID,mode=mode)$mzCenter,100)
+		w@specs[[pos]]$peaks <- peaksHand
+		w@specs[[pos]]$mz <- findMz(cpdID,mode=mode)
+		w@specs[[pos]]$id <- cpdID
+		w@specs[[pos]]$formula <- findFormula(cpdID)
+	} else { pos <- which(pos)
+			w@specs[[pos]]$childHeader <- rbind(w@specs[[pos]]$childHeader,childHeaderAddition)
+			w@specs[[pos]]$peaks <- c(w@specs[[pos]]$peaks, peaksHand) }
+		
+		return(w)
+}
+
+#' MassBank-record Addition
+#' 
+#' Adds the peaklist of a MassBank-Record to the specs of an msmsWorkspace
+#'
+#' @aliases addMB
+#' @usage addMB(w, cpdID, fileName, mode)
+#' @param w The msmsWorkspace that the peaklist should be added to.
+#' @param cpdID The compoundID of the compound that has been used for the record
+#' @param fileName The path to the record
+#' @param mode The ionization mode that has been used to create the record
+#' @return The \code{msmsWorkspace} with the additional peaklist from the record
+#' @seealso \code{\link{addPeaksManually}}
+#' @author Erik Mueller
+#' @examples \dontrun{
+#' 		w <- addMB(w, cpdID, "filepath_to_records/RC00001.txt", mode = "pH")
+#' }
+#' @export
+addMB <- function(w, cpdID, fileName, mode){
+	mb <- parseMassBank(fileName)
+	peaklist <- list()
+	peaklist[[1]] <- mb@compiled_ok[[1]][["PK$PEAK"]][,1:2]
+	w <- addPeaksManually(w, cpdID, peaklist[[1]], mode)
+	return(w)
+}
--- a/R/parseMassBank.R
+++ b/R/parseMassBank.R
+#' MassBank-record Parser
+#' 
+#' Parses a plain text MassBank record (record format version 2) into an
+#' \code{mbWorkspace}. Only the first element of \code{Files} is read.
+#'
+#' Every tag is located by a literal search for its name; the value is the
+#' text that follows the tag and one separating character. Tags that do not
+#' occur in the record are stored as \code{NA}.
+#'
+#' @aliases parseMassBank
+#' @usage parseMassBank(Files)
+#' @param Files A path to the plaintext-record that should be read
+#' @return The \code{mbWorkspace} that the plaintext-record creates. The parsed
+#'   record is the first element of its \code{compiled_ok} slot.
+#' @seealso \code{\link{validate}}
+#' @author Erik Mueller
+#' @examples \dontrun{
+#' 		parseMassBank("filepath_to_records/RC00001.txt")
+#' }
+#' @export
+parseMassBank <- function(Files){
+	mb <- new("mbWorkspace")
+	# readLines() opens and closes the file itself, so no connection is left
+	# open when reading fails.
+	record <- readLines(normalizePath(Files[1]))
+
+	# Values of all lines containing `tag`. The offset is derived from the tag
+	# itself (tag + one separator character) instead of being hard-coded, which
+	# is what went wrong for RECORD_TITLE and CAPILLARY_VOLTAGE before.
+	readTag <- function(tag){
+		substring(grep(tag, record, value = TRUE, fixed = TRUE), nchar(tag) + 2)
+	}
+	# Named list with one readTag() result per subtag of `tag`.
+	readSubtags <- function(tag, subtags){
+		values <- lapply(paste(tag, subtags), readTag)
+		names(values) <- subtags
+		values
+	}
+	# Replaces every zero-length entry of a list by NA.
+	naIfEmpty <- function(entries){
+		lapply(entries, function(entry) if(length(entry) == 0) NA else entry)
+	}
+	# Converts the whitespace separated lines first..last into a data.frame.
+	# Every line must hold exactly length(columns) fields.
+	readTable <- function(first, last, columns){
+		fields <- strsplit(record[first:last], " ")
+		cells <- matrix(nrow = length(fields), ncol = length(columns))
+		for(k in seq_along(fields)){
+			cells[k,] <- fields[[k]][fields[[k]] != ""]
+		}
+		cells <- as.data.frame(cells, stringsAsFactors = FALSE)
+		cells[] <- lapply(cells, type.convert, as.is = TRUE)
+		colnames(cells) <- columns
+		cells
+	}
+
+	rec <- list()
+	rec[['ACCESSION']] <- readTag('ACCESSION:')
+	rec[['RECORD_TITLE']] <- readTag('RECORD_TITLE:')
+	rec[['DATE']] <- format(as.Date(readTag('DATE:'), format = "%Y.%m.%d"), "%Y.%m.%d")
+	rec[['AUTHORS']] <- readTag('AUTHORS:')
+	rec[['LICENSE']] <- readTag('LICENSE:')
+	rec[['COPYRIGHT']] <- readTag('COPYRIGHT:')
+	##PUBLICATION is not read yet
+
+	##Comments, names and links may occur several times; they are kept as
+	##flat, unnamed lists (RMassBank itself names comments and links when
+	##writing records)
+	rec[['COMMENT']] <- as.list(readTag('COMMENT:'))
+	rec[['CH$NAME']] <- as.list(readTag('CH$NAME:'))
+	rec[['CH$COMPOUND_CLASS']] <- readTag('CH$COMPOUND_CLASS:')
+	rec[['CH$FORMULA']] <- readTag('CH$FORMULA:')
+	rec[['CH$EXACT_MASS']] <- as.numeric(readTag('CH$EXACT_MASS:'))
+	rec[['CH$SMILES']] <- readTag('CH$SMILES:')
+	rec[['CH$IUPAC']] <- readTag('CH$IUPAC:')
+	rec[['CH$LINK']] <- as.list(readTag('CH$LINK:'))
+
+	##SP$ will be included later since it's kind of rarely used
+
+	rec[['AC$INSTRUMENT']] <- readTag('AC$INSTRUMENT:')
+	rec[['AC$INSTRUMENT_TYPE']] <- readTag('AC$INSTRUMENT_TYPE:')
+
+	##A record without "AC$MASS_SPECTROMETRY: MS_TYPE" is treated as a
+	##version 1 record; for those only a few subtags are read.
+	ac_ms <- readSubtags('AC$MASS_SPECTROMETRY:', 'MS_TYPE')
+	# Initialised for both versions so the version 1 path no longer fails with
+	# "object not found"; empty sections end up as NA in the result.
+	ac_lc <- list()
+	ms_fi <- list()
+	if(length(ac_ms[['MS_TYPE']]) == 0){
+		ac_ms[['MS_TYPE']] <- readTag('AC$ANALYTICAL_CONDITION: MS_TYPE')
+		ac_ms[['IONIZATION']] <- readTag('AC$MASS_SPECTROMETRY: IONIZATION')
+		ac_ms[['ION_MODE']] <- readTag('AC$ANALYTICAL_CONDITION: MODE')
+	} else{
+		##Some of the following are part of the (optional) specification,
+		##but are not written by RMassBank
+		ac_ms <- c(ac_ms, readSubtags('AC$MASS_SPECTROMETRY:', c(
+			'ION_MODE', 'COLLISION_ENERGY', 'COLLISION_GAS', 'DATE',
+			'DESOLVATION_GAS_FLOW', 'DESOLVATION_TEMPERATURE',
+			'IONIZATION_ENERGY', 'LASER', 'MATRIX', 'MASS_ACCURACY',
+			'REAGENT_GAS', 'SCANNING')))
+
+		ac_lc <- readSubtags('AC$CHROMATOGRAPHY:', c(
+			'CAPILLARY_VOLTAGE', 'COLUMN_NAME', 'COLUMN_TEMPERATURE',
+			'FLOW_GRADIENT', 'FLOW_RATE', 'RETENTION_TIME',
+			'SOLVENT A', 'SOLVENT B'))
+
+		ms_fi <- readSubtags('MS$FOCUSED_ION:',
+			c('BASE_PEAK', 'PRECURSOR_M/Z', 'PRECURSOR_TYPE'))
+		ms_fi[['BASE_PEAK']] <- as.double(ms_fi[['BASE_PEAK']])
+		# The precursor m/z is only converted to a number for MS2 records.
+		if(isTRUE(ac_ms[['MS_TYPE']][1] == 'MS2')){
+			ms_fi[['PRECURSOR_M/Z']] <- as.double(ms_fi[['PRECURSOR_M/Z']])
+		}
+	}
+	rec[['AC$MASS_SPECTROMETRY']] <- naIfEmpty(ac_ms)
+	rec[['AC$CHROMATOGRAPHY']] <- naIfEmpty(ac_lc)
+	rec[['MS$FOCUSED_ION']] <- naIfEmpty(ms_fi)
+
+	##Can currently only read annotations of the type "m/z num {formula mass error(ppm)}"
+	##and only if there is a single annotation block.
+	##The column names are chosen so RMassBank can write the table again.
+	# NOTE(review): ReadAnnotation is not defined in this function; it is
+	# looked up in the calling environment chain / search path.
+	PKannotationStart <- grep('PK$ANNOTATION:', record, fixed = TRUE) + 1
+	numpeak <- grep('PK$NUM_PEAK:', record, fixed = TRUE)
+	if(length(PKannotationStart) > 0 && isTRUE(ReadAnnotation == TRUE)){
+		# The table is only stored when it was actually parsed.
+		if(length(numpeak) > 0 && PKannotationStart[1] < numpeak[1]){
+			PKannotation <- readTable(PKannotationStart[1], numpeak[1] - 1,
+				c("mz", "num", "{formula", "mass", "error(ppm)}"))
+			PKannotation$"{formula" <- as.character(PKannotation$"{formula")
+			rec[['PK$ANNOTATION']] <- PKannotation
+		}
+	}
+
+	##Extract the peaks and write the data into a data.frame
+	PKStart <- grep('PK$PEAK:', record, fixed = TRUE) + 1
+	PKPeak <- NA
+	if(length(PKStart) > 0){
+		# '//' also occurs in URLs (e.g. in CH$LINK lines), so only the first
+		# occurrence behind the peak header is taken as the record terminator.
+		endslash <- grep('//', record, fixed = TRUE)
+		endslash <- endslash[endslash >= PKStart[1]]
+		if(length(endslash) > 0 && PKStart[1] < endslash[1]){
+			PKPeak <- readTable(PKStart[1], endslash[1] - 1,
+				c("mz", "int", "rel.int"))
+		}
+	}
+	rec[['PK$PEAK']] <- PKPeak
+
+	# Tags that were not found are reported as NA instead of empty vectors.
+	mb@compiled_ok <- list(naIfEmpty(rec))
+	print(paste("Read",Files[1]))
+	flush.console()
+	return(mb)
+}
\ No newline at end of file
--- a/R/settings_example.R
+++ b/R/settings_example.R
-
+#' @import yaml
+NULL

 .checkMbSettings <- function()
 {
@@ -13,12 +14,14 @@
 #' Describes all settings for the RMassBank settings file.
 #' 
 #' \itemize{
-#' 		\item{\code{deprofile}}{Whether and how to deprofile input raw files. Leave the 
+#' 		\item{\code{deprofile}}{
+#'   	Whether and how to deprofile input raw files. Leave the 
 #' 			setting empty if your raw files are already in "centroid" mode. If your
 #' 			input files are in profile mode, you have the choice between algorithms
 #' 			\code{\link{deprofile}.spline, deprofile.fwhm, deprofile.localMax}; refer to
 #' 			the individual manpages for more information.}
-#' 		\item{\code{rtMargin, rtShift}}{The allowed retention time deviation relative to the
+#' 		\item{\code{rtMargin, rtShift}}{
+#'   	The allowed retention time deviation relative to the
 #' 			values specified in your compound list (see \code{\link{loadList}}), and the systematic
 #' 			shift (due to the use of, e.g., pre-columns or other special equipment.}
 #' 		\item{\code{babeldir}}{
@@ -28,16 +31,18 @@
 #' 			have explicit hydrogen atoms.
 #'			The path should point to the directory where babel.exe (or the Linux "babel" equivalent) lies.
 #' 			}
-#' 		\item{\code{use_version}}{Which MassBank record format to use; version 2 is strongly advised,
+#' 		\item{\code{use_version}}{
+#'   	Which MassBank record format to use; version 2 is strongly advised,
 #' 			version 1 is considered outdated and should be used only if for some reason you are running
 #' 			old servers and an upgrade is not feasible.}
-#' 		\item{\code{use_rean_peaks}}{Whether to include peaks from reanalysis (see 
+#' 		\item{\code{use_rean_peaks}}{
+#'   	Whether to include peaks from reanalysis (see 
 #' 			\code{\link{reanalyzeFailpeaks}}) in the MassBank records. Boolean, TRUE or FALSE.
 #' 			}
 #' 		\item{\code{annotations}}{
 #' 			A list of constant annotations to use in the MassBank records. The entries
 #' 			\code{authors, copyright, license, instrument, instrument_type, compound_class}
-#' 			correspond to the MassBank entries \code{AUTHORS, COPYRIGHT, LICENSE, AC$INSTRUMENT,
+#' 			correspond to the MassBank entries \code{AUTHORS, COPYRIGHT, PUBLICATION, LICENSE, AC$INSTRUMENT,
 #' 			AC$INSTRUMENT_TYPE, CH$COMPOUND_CLASS}. The entry \code{confidence_comment} is added as
 #' 			\code{COMMENT: CONFIDENCE} entry. 
 #' 
@@ -58,28 +63,100 @@
 #' 			Entries under \code{ms_dataprocessing} are added as \code{MS$DATA_PROCESSING:} entries,
 #' 			in addition to the default \code{WHOLE: RMassBank}.   
 #' 			}
-#' 		\item{\code{spectraList}}{This setting describes the experimental annotations for the single
+#'   	\item{\code{annotator}}{
+#'     For advanced users: option to select your own custom annotator. 
+#'     Check \code{\link{annotator.default}} and the source code for details.}
+#' 		\item{\code{spectraList}}{
+#'   	This setting describes the experimental annotations for the single
 #' 			data-dependent scans. For every data-dependent scan event, a \code{spectraList} entry with
 #' 			\code{mode, ces, ce, res} denoting collision mode, collision energy in short and verbose 
 #' 			notation, and FT resolution.}
-#' 		\item{\code{accessionNumberShifts}}{This denotes the starting points for accession numbers
+#' 		\item{\code{accessionNumberShifts}}{
+#'   	This denotes the starting points for accession numbers
 #' 			for different ion types. For example, \code{pH: 0, mH: 50} means that [M+H]+ spectra will
 #' 			start at \code{XX123401} (\code{XX} being the \code{entry_prefix} and \code{1234} the compound
 #' 			id) and [M-H]- will start at \code{XX123451}.}
-#' 		\item{\code{electronicNoise, electronicNoiseWidth}}{Known electronic noise peaks and the window
+#' 		\item{\code{electronicNoise, electronicNoiseWidth}}{
+#'   	Known electronic noise peaks and the window
 #' 			to be used by \code{\link{cleanElnoise}}}
-#' 		\item{\code{recalibrateBy}}{\code{dppm} or \code{dmz} to recalibrate either by delta ppm or by
+#' 		\item{\code{recalibrateBy}}{
+#'   	\code{dppm} or \code{dmz} to recalibrate either by delta ppm or by
 #' 			delta mz.}
-#' 		\item{\code{recalibrateMS1}}{\code{common} or \code{separate} to recalibrate MS1 data points together
+#' 		\item{\code{recalibrateMS1}}{
+#'   	\code{common} or \code{separate} to recalibrate MS1 data points together
 #' 			or separately from MS2 data points.}
-#' 		\item{\code{recalibrator: MS1, MS2}}{The functions to use for recalibration of MS1 and MS2 data points.
+#' 		\item{\code{recalibrator: MS1, MS2}}{
+#'   	The functions to use for recalibration of MS1 and MS2 data points.
 #' 			Note that the \code{MS1} setting is only meaningful if \code{recalibrateMS1: separate}, otherwise
 #' 			the \code{MS2} setting is used for a common recalibration curve. See \code{\link{recalibrate.loess}}
-#' 			for details.
-#' 			}
+#' 			for details.}
+#'   	\item{\code{multiplicityFilter}}{
+#'     Define the multiplicity filtering level. Default is 2, a value of 1 
+#'     is off (no filtering) and >2 is harsher filtering.}
+#'     \item{\code{titleFormat}}{
+#'     The title of MassBank records is a mini-summary
+#'     of the record, for example "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+". 
+#'     By default, the first compound name \code{CH$NAME}, instrument type 
+#'     \code{AC$INSTRUMENT_TYPE}, MS/MS type \code{AC$MASS_SPECTROMETRY: MS_TYPE}, 
+#'     collision energy \code{RECORD_TITLE_CE}, resolution \code{AC$MASS_SPECTROMETRY: RESOLUTION}
+#'     and precursor \code{MS$FOCUSED_ION: PRECURSOR_TYPE} are used. If alternative 
+#'     information is relevant to differentiate acquired spectra, the title should be adjusted.
+#'     For example, many TOFs do not have a resolution setting. 
+#'     See MassBank documentation for more.}
+#'   	\item{\code{filterSettings}}{
+#' 			A list of settings that affect the MS/MS processing. The entries
+#' 			\code{ppmHighMass, ppmLowMass, massRangeDivision} set values for 
+#'   		pre-processing, prior to recalibration. \code{ppmHighMass} defines the 
+#'     	ppm error for the high mass range (default 10 ppm for Orbitraps), 
+#'       \code{ppmLowMass} is the error for the low mass range (default 15 ppm 
+#'       for Orbitraps) and \code{massRangeDivision} is the m/z value defining 
+#'       the split between the high and low mass range (default m/z = 120).
+#' 
+#' 			The entry \code{ppmFine} defines the ppm cut-off post recalibration. 
+#'   		The default value of 5 ppm is recommended for Orbitraps. For other 
+#'     	instruments this can be interpreted from the recalibration plot.
+#'      All ppm limits are one-sided (e.g. this includes values to +5 ppm or -5 ppm 
+#'      deviation from the exact mass).
+#' 			
+#' 			The entries \code{prelimCut, prelimCutRatio} define the intensity cut-off and 
+#'   		cut-off ratio (in % of the most intense peak) for pre-processing. This affects 
+#'     	the peak selection for the recalibration only. Careful: the default value 
+#'       1e4 for Orbitrap LTQ positive mode could remove all peaks for TOF data 
+#'       and will remove too many peaks for Orbitrap LTQ negative mode spectra!
+#' 
+#' 			The entry \code{specOkLimit} defines the intensity limit to include MS/MS spectra.
+#'   		MS/MS spectra must have at least one peak above this limit to proceed through 
+#'     	the workflow.
+#' 
+#' 			\code{dbeMinLimit} defines the minimum allowable ring and double bond equivalents (DBE) 
+#'   		allowed for assigned formulas. This assumes maximum valences for elements with 
+#'     	multiple valence states. The default is -0.5 (accounting for fragments being ions).
+#' 
+#' 			The entries \code{satelliteMzLimit, satelliteIntLimit} define the cut-off m/z and 
+#'   		intensity values for satellite peak removal (an artefact of Fourier Transform 
+#'     	processing). All peaks within the m/z limit (default 0.5) and intensity ratio 
+#'       (default 0.05 or 5 %) of the respective peak will be removed. Applicable to 
+#'       Fourier Transform instruments only (e.g. Orbitrap).   
+#' 			}  
+#'     \item{\code{findMsMsRawSettings}}{
+#' 			Parameters for adjusting the raw data retrieval. 
+#'   		The entry \code{ppmFine} defines the ppm error to look for the precursor in 
+#'     	the MS1 (parent) spectrum. Default is 10 ppm for Orbitrap.
+#' 
+#' 			\code{mzCoarse} defines the error to search for the precursor specification 
+#'   		in the MS2 spectrum. This is often only saved to 2 decimal places and thus 
+#'     	can be quite inaccurate. The accuracy also depends on the isolation window used. 
+#'       The default settings (for e.g. Orbitrap) is 0.5 (Da, or Th for m/z).
+#' 
+#' 			The entry \code{fillPrecursorScan} is largely untested. The default value 
+#'   		(FALSE) assumes all necessary precursor information is available in the mzML file.
+#'     	A setting of TRUE tries to fill in the precursor data scan number if it is missing.
+#'       Only tested on one case study so far - feedback welcome!   
+#' 			}  
 #' }
 #' 
 #' 
+#' @author Michael Stravs, Emma Schymanski
 #' @seealso \code{\link{loadRmbSettings}}
 #' @rdname RmbSettings
 #' @name RmbSettings
@@ -184,11 +261,55 @@ NULL
  recalibrator = list(
 	MS1 = "recalibrate.loess",
 	MS2 = "recalibrate.loess"),
+# Window width to look for MS1 peaks to recalibrate (in ppm)
+	recalibrateMS1Window= 15,
+
  # Define the multiplicity filtering level
  # Default is 2 (peak occurs at least twice)
  # Set this to 1 if you want to turn this option off.
  # Set this to anything > 2 if you want harder filtering
-  multiplicityFilter = 2
+  multiplicityFilter = 2,
+	# Define the title format.
+	# You can use all entries from MassBank records as tokens
+	# plus the additional token RECORD_TITLE_CE, which is a shortened
+	# version of the collision energy specifically for use in the title.
+	# Every line is one entry and must have one token in curly brackets
+	# e.g. {CH$NAME} or {AC$MASS_SPECTROMETRY: MS_TYPE} plus optionally
+	# additional text in front or behind e.g.
+	# R={AC$MASS_SPECTROMETRY: RESOLUTION}
+	# If this is not specified, it defaults to a title of the format
+	# "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+"
+  titleFormat = c(
+		  "{CH$NAME}",
+		  "{AC$INSTRUMENT_TYPE}",
+		  "{AC$MASS_SPECTROMETRY: MS_TYPE}",
+		  "CE: {RECORD_TITLE_CE}",
+		  "R={AC$MASS_SPECTROMETRY: RESOLUTION}",
+		  "{MS$FOCUSED_ION: PRECURSOR_TYPE}"
+  ),
+# Define filter settings.
+# For Orbitrap, settings of 15 ppm in low mass range, 10 ppm in high
+# mass range, m/z = 120 as mass range division and 5 ppm for recalibrated
+# data overall are recommended. 
+  filterSettings = list(
+		  	ppmHighMass = 10,
+  			ppmLowMass = 15,
+		  massRangeDivision= 120,
+		  ppmFine= 5,
+		  prelimCut= 1e4,
+		  prelimCutRatio= 0,
+		  fineCut= 0,
+		  fineCutRatio= 0,
+		  specOkLimit= 1e4,
+		  dbeMinLimit= -0.5,
+		  satelliteMzLimit= 0.5,
+		  satelliteIntLimit= 0.05
+  	),
+	
+	findMsMsRawSettings = list(
+			ppmFine= 10,
+			mzCoarse= 0.5,
+			fillPrecursorScan= FALSE)
  )

 # Writes a file with sample settings which the user can adjust with his values.

--- a/R/tools.R
+++ b/R/tools.R
@@ -33,7 +33,7 @@
 #'  w1 <- msmsWorkflow(w, steps=c(1:7), mode="pH")
 #'  w2 <- msmsWorkflow(w, steps=c(1:7), mode="pH", confirmMode = 1)
 #'  wTotal <- combineMultiplicities(c(w1, w2))
-#'  wTotal <- msmsWorkflow(wTotal, steps=8, mode="pH", archiveName = "output")
+#'  wTotal <- msmsWorkflow(wTotal, steps=8, mode="pH", archivename = "output")
 #'  # continue here with mbWorkflow 
 #' }
 #' 
@@ -54,3 +54,33 @@ combineMultiplicities <- function(workspaces)
 	
 	return(wOut)
 }
+
+
+#' Determine processed steps
+#' 
+#' Inspects the result slots of a \code{workspace} object, one per
+#' \code{msmsWorkflow} step, and reports the steps whose slot is not empty.
+#' 
+#' @param workspace A \code{msmsWorkspace} object. 
+#' 
+#' @return An integer vector with the numbers (1 to 8) of all
+#' \code{msmsWorkflow} steps whose result slot has a length greater than zero.
+#' 
+#' @examples \dontrun{
+#' findProgress(w)
+#' }
+#' @author Stravs MA, Eawag <michael.stravs@@eawag.ch>
+#' @export
+findProgress <- function(workspace)
+{
+    # Result slots in workflow order: position k belongs to step k.
+    stepSlots <- c("specs", "analyzedSpecs", "aggregatedSpecs",
+            "recalibratedSpecs", "analyzedRcSpecs", "aggregatedRcSpecs",
+            "reanalyzedRcSpecs", "refilteredRcSpecs")
+    filled <- logical(length(stepSlots))
+    for (k in seq_along(stepSlots)) {
+        # A step counts as processed as soon as its slot holds anything.
+        filled[k] <- length(slot(workspace, stepSlots[k])) > 0
+    }
+    which(filled)
+}
--- a/R/validateMassBank.R
+++ b/R/validateMassBank.R
+#' Validate MassBank records with a set of Unit tests
+#' 
+#' Validates a plain text MassBank record, or recursively all
+#' records within a directory. The Unit Tests to be used are
+#' installed in RMassBank/inst/unitTests and currently include 
+#' checks for NAs, peaks versus precursor, precursor mz, 
+#' precursor type, SMILES vs exact mass, total intensities and
+#' title versus type. The validation report is saved as 
+#' "report.html" in the working directory.
+#' 
+#' Records whose MS type is "MS2" or "MS" are checked with the MS2 test
+#' file, all other records (including records without a readable MS type)
+#' with the MSn test file.
+#' 
+#' @aliases validate
+#' @usage validate(path)
+#' @param path The filepath to a single record, or a directory to search recursively
+#' @examples
+#' \dontrun{
+#' validate("/tmp/MassBank/OpenData/record/")
+#' }
+#' @export
+validate <- function(path) {
+
+	# Both packages are optional dependencies. require() is used on purpose:
+	# the code below calls their functions unqualified, so they have to be
+	# attached, and a missing package is turned into an explicit error.
+	if (!require(ontoCAT)) {
+		stop("Package ontoCAT missing. Validation requires package ontoCAT and RUnit")
+	}
+
+	if (!require(RUnit)) {
+		stop("Package RUnit missing. Validation requires package ontoCAT and RUnit")
+	}
+
+	RMassBank.env$Instrument_List <- .getInstruments()
+	RMassBank.env$testnumber <- 1
+	# Is the argument a directory? If yes, list the files.
+	# isTRUE() guards against the NA that file.info() yields for a path that
+	# does not exist; such a path is passed on as a file name.
+	if(isTRUE(file.info(path[1])$isdir)){
+		Files <- list.files(path = path,
+			recursive = TRUE,
+			full.names = TRUE)
+	} else {
+		Files <- path
+	}
+	if(length(Files) == 0){
+		stop("No MassBank records found in ", path[1])
+	}
+	# Parsing with the help of the parseMassBank-function
+	RMassBank.env$mb <- lapply(Files,parseMassBank)
+	# Test RMassBank Objects with RUnit
+	# This creates the tests and defines one test suite for every record
+	unitTestDir <- system.file(package="RMassBank", "unitTests")
+	tests <- lapply(seq_along(Files), function(i){
+		msType <- RMassBank.env$mb[[i]]@compiled_ok[[1]][['AC$MASS_SPECTROMETRY']][['MS_TYPE']]
+		# %in% never yields NA, so a missing MS type cannot break the if().
+		if(isTRUE(msType[1] %in% c("MS2", "MS"))){
+			testFile <- "runit.MS2.test.R"
+		} else{
+			testFile <- "^runit.MSn.test.[rR]$"
+		}
+		defineTestSuite(Files[i], dirs = unitTestDir, testFileRegexp = testFile,
+			rngKind = "Marsaglia-Multicarry",
+			rngNormalKind = "Kinderman-Ramage")
+	})
+	print("Starting Tests")
+	# Testing the list of Testsuites
+	testData <- runTestSuite(tests)
+	# Prints the HTML-record
+	printHTMLProtocol(testData, fileName = file.path(getwd(), "report.html"))
+	print(paste("Report for the file(s) finished"))
+}
+
+# Checks whether ontoCAT can read an .obo-file: returns TRUE when an
+# ontology accession can be obtained from the loaded ontology, FALSE otherwise.
+.isOboReadable <- function(filename){
+
+	# getOntology() has a problem with relative Windows paths (it wants an
+	# URI), therefore the path is made absolute before it is handed over.
+	ontology <- getOntology(normalizePath(filename))
+	!is.null(getOntologyAccession(ontology))
+}
+
+# Downloads the psi-ms.obo ontology (source of the allowed instrument names)
+# and writes it to "psi-ms.obo" in the working directory.
+# All lines containing "import:" are dropped on the way. This is a _temporary_
+# fix until it is clear why getOntology() doesn't work when such lines are
+# present; the imported ontologies are not needed here.
+.downloadPsiObo <- function(){
+	connPsiObo <- url("http://psidev.cvs.sourceforge.net/viewvc/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo")
+	# Close the connection even if the download fails.
+	oboFile <- tryCatch(readLines(connPsiObo), finally = close(connPsiObo))
+	# A logical mask is used instead of oboFile[-grep(...)]: with no match the
+	# negative index is empty and would drop every line of the file.
+	oboFile <- oboFile[!grepl("import:", oboFile, fixed = TRUE)]
+	writeLines(oboFile, "psi-ms.obo")
+	invisible(NULL)
+}
+
+# Makes sure a psi-ms.obo exists in the working directory.
+# If a readable file is already there, the user is asked on stdin whether it
+# should be updated; otherwise the file is downloaded right away.
+# Always returns TRUE.
+# Will be converted to "checkforinstruments" as soon as the problem with
+# getOntology() is found.
+.checkForPsiMs <- function(){
+	
+	if(file.exists("psi-ms.obo") && .isOboReadable("psi-ms.obo")){
+		print("It seems that you have a working psi-ms.obo, do you want to update it? [y/n]")
+		while(TRUE){
+			answer <- readLines(stdin(), n=1, warn=FALSE)
+			# No input at all (end of input, e.g. a non-interactive run): keep
+			# the working file instead of failing on a zero-length comparison.
+			if(length(answer) == 0 || identical(answer, "n")){
+				return(TRUE)
+			}
+			if(identical(answer, "y")){
+				.downloadPsiObo()
+				return(TRUE)
+			}
+			print("Please type exactly y or n")
+		}
+	}
+	.downloadPsiObo()
+	return(TRUE)
+}
+
+# Returns the possible instrument names: the labels of all child terms of
+# "MS_1000031" in the psi-ms.obo ontology shipped with the package.
+# An ontology without such child terms yields an empty character vector.
+.getInstruments <- function(){
+	Onto <- getOntology(system.file(package = "RMassBank", "psi-ms.obo"))
+	instrumentTerms <- getAllTermChildrenById(Onto,"MS_1000031")
+	# Preallocated, and iterated with seq_along() so that an empty term list
+	# is skipped instead of being indexed at position 1.
+	instruments <- character(length(instrumentTerms))
+	for(i in seq_along(instrumentTerms)){
+		instruments[i] <- getLabel(instrumentTerms[[i]])
+	}
+	return(instruments)
+}
+
+#' Calculate the mass from a SMILES-String
+#' 
+#' Parses the SMILES with rcdk, runs the rcdk typing, aromaticity, 
+#' explicit-hydrogen and isotope steps on the first parsed molecule and 
+#' returns the exact mass rcdk reports for it.
+#'
+#' @aliases smiles2mass
+#' @usage smiles2mass(SMILES)
+#' @param SMILES A String-object representing a SMILES
+#' @return The calculated mass of the given SMILES-Formula
+#' @author Erik Mueller
+#' @examples \dontrun{
+#' 		smiles2mass("CC(=O)NC(C(O)1)C(O)C(OC(O2)C(O)C(OC(O3)C(O)C(O)C(O)C(CO)3)C(O)C(CO)2)C(CO)O1")
+#' }
+#' @export
+smiles2mass <- function(SMILES){
+	molecule <- parse.smiles(SMILES)[[1]]
+	# The preparation steps are called for their effect on `molecule` only;
+	# their return values are not used (presumably rcdk updates the molecule
+	# object in place -- confirm against the rcdk documentation).
+	preparation <- list(do.typing, do.aromaticity,
+		convert.implicit.to.explicit, do.isotopes)
+	for(prepare in preparation){
+		prepare(molecule)
+	}
+	get.exact.mass(molecule)
+}
--- a/R/zzz.R
+++ b/R/zzz.R
+# Package load hook: sets up the environment that holds RMassBank's shared
+# state and its initial values.
+.onLoad <- function(libname, pkgname) {
+  # Super-assignment: RMassBank.env is created outside of this function so
+  # that it outlives the call.
+  RMassBank.env <<- new.env()
+  # Initial values of the shared settings/counters.
+  RMassBank.env$ReadAnnotation <- FALSE
+  RMassBank.env$testnumber <- 1
+  # Local to .onLoad; not used any further inside this function.
+  mb <- list()
+  # Puts the contents of RMassBank.env on the search path, so that its
+  # variables can be referred to by their bare names.
+  attach(RMassBank.env)
+}
\ No newline at end of file
--- a/inst/CITATION
+++ b/inst/CITATION
@@ -6,19 +6,22 @@

 citHeader("To cite package 'RMassBank' in publications use:")

-year <- sub(".*(2[[:digit:]]{3})-.*", "\\1", meta$Date, perl = TRUE)
+year <- 2013

-citEntry(entry="unpublished",
+citEntry(entry="article",
 		title = "Automatic Recalibration and Processing of Tandem Mass Spectra using Formula Annotation.",
 		author = personList(as.person("Michael A. Stravs"),
 				as.person("Emma L. Schymanski"),
 				as.person("Heinz Singer"),
 				as.person("Juliane Hollender")),
-		year = year,
-		note = "in preparation",
+		year = 2013,
+		journal = "Journal of Mass Spectrometry",
+		volume = 48, 
+		number = 1,
+		pages = "89--99",  # quoted: a bare 89--99 is arithmetic (89 - -99 = 188)
 		
 		textVersion =
-				paste("M. A. Stravs, E. L. Schymanski, H. Singer, J. Hollender, ",
-						"Automatic Recalibration and Processing of Tandem Mass Spectra using Formula Annotation. ",
-						year,", ",
-						"in preparation.", sep=""))
+				paste("M. A. Stravs, E. L. Schymanski, H. Singer, J. Hollender", year,
+						"Automatic Recalibration and Processing of Tandem Mass Spectra using Formula Annotation",
+						"Journal of Mass Spectrometry", "48(1)",
+						"89-99.", sep=", "))
--- a/inst/RMB_options.ini
+++ b/inst/RMB_options.ini
@@ -64,6 +64,13 @@ annotations:
    ms_dataprocessing:
        RECALIBRATE: loess on assigned fragments and MS1

+# Annotator:
+# by default, "annotator.default" is used.
+# If you want to build your custom annotator (check ?annotator.default and the source code),
+# select it here by using e.g.
+# annotator: annotator.myown
+# for a function annotator.myown(annotation)
+
 # List of data-dependent scans in their order (relative to the parent scan), for annotation of the MassBank records
 # For every data-dependent scan event, specify an element with:
 # mode: fragmentation mode, e.g. CID
@@ -158,6 +165,8 @@ recalibrateBy: dppm
 # with common curve (common)
 # do not recalibrate (none)
 recalibrateMS1: common
+# Window width to look for MS1 peaks to recalibrate (in ppm)
+recalibrateMS1Window: 15

 # Custom recalibration function: You can overwrite the recal function by
 # making any function which takes rcdata$recalfield ~ rcdata$mzFound.
@@ -174,3 +183,49 @@ recalibrator:
 # Set this to 1 if you want to turn this option off.
 # Set this to anything > 2 if you want harder filtering
 multiplicityFilter: 2
+
+# Define the title format.
+# You can use all entries from MassBank records as tokens
+# plus the additional token RECORD_TITLE_CE, which is a shortened
+# version of the collision energy specifically for use in the title.
+# Every line is one entry and must have one token in curly brackets
+# e.g. {CH$NAME} or {AC$MASS_SPECTROMETRY: MS_TYPE} plus optionally
+# additional text in front or behind e.g.
+# R={AC$MASS_SPECTROMETRY: RESOLUTION}
+# If this is not specified, it defaults to a title of the format
+# "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+"
+# Note that every entry must be enclosed in "" here, because otherwise the : characters get mangled!
+titleFormat:
+- "{CH$NAME}"
+- "{AC$INSTRUMENT_TYPE}"
+- "{AC$MASS_SPECTROMETRY: MS_TYPE}"
+- "CE: {RECORD_TITLE_CE}"
+- "R={AC$MASS_SPECTROMETRY: RESOLUTION}"
+- "{MS$FOCUSED_ION: PRECURSOR_TYPE}"
+
+# Define filter settings.
+# For Orbitrap, settings of 15 ppm in low mass range, 10 ppm in high
+# mass range, m/z = 120 as mass range division and 5 ppm for recalibrated
+# data overall are recommended. 
+filterSettings:
+    ppmHighMass: 10
+    ppmLowMass: 15
+    massRangeDivision: 120
+    ppmFine: 5
+    prelimCut: 1e4
+    prelimCutRatio: 0
+    fineCut: 0
+    fineCutRatio: 0
+    specOkLimit: 1e4
+    dbeMinLimit: -0.5
+    satelliteMzLimit: 0.5
+    satelliteIntLimit: 0.05
+    
+# Define raw MS retrieval settings.
+findMsMsRawSettings:
+    ppmFine: 10
+    mzCoarse: 0.5
+    # fillPrecursorScan is FALSE for "good" mzML files which have all the info needed.
+    # However, for example AB Sciex files will have missing precursor scan information,
+    # in which case fillPrecursorScan = TRUE is needed. Try it out.
+    fillPrecursorScan: FALSE
--- a/inst/psi-ms.obo
+++ b/inst/psi-ms.obo
--- a/inst/unitTests/runit.MS2.instruments.R-disabled
+++ b/inst/unitTests/runit.MS2.instruments.R-disabled
+# Check that the AC$INSTRUMENT entry of the record under test is listed in
+# RMassBank.env$Instrument_List.
+test.instrumentname <- function(){
+	record <- RMassBank.env$mb[[RMassBank.env$testnumber]]@compiled_ok[[1]]
+	knownInstruments <- RMassBank.env$Instrument_List
+	checkTrue(record[['AC$INSTRUMENT']] %in% knownInstruments)
+}
--- a/inst/unitTests/runit.MS2.test.R
+++ b/inst/unitTests/runit.MS2.test.R
+# Check that the peak table (PK$PEAK) of the record under test contains no NA.
+test.NA <- function(){
+	record <- RMassBank.env$mb[[RMassBank.env$testnumber]]@compiled_ok[[1]]
+	peakValues <- as.matrix(record[['PK$PEAK']])
+	hasMissing <- NA %in% peakValues
+	checkTrue(!hasMissing)
+}
+
+# Compare the first-column value in the last row of the peak table with the
+# precursor m/z, using a tolerance of Precursor/100. The comparison is skipped
+# (trivially passing) when the precursor m/z is NA.
+test.peaksvsprecursor <- function(){
+	record <- RMassBank.env$mb[[RMassBank.env$testnumber]]@compiled_ok[[1]]
+	peakTable <- record[['PK$PEAK']]
+	lastRowMz <- unname(peakTable[nrow(peakTable),1])
+	Precursor <- record[['MS$FOCUSED_ION']][['PRECURSOR_M/Z']]
+	if(!is.na(Precursor)){
+		checkEquals(lastRowMz, Precursor, tolerance = Precursor/100)
+	}else{
+		checkTrue(TRUE)
+	}
+}
+
+# Check that PRECURSOR_M/Z equals CH$EXACT_MASS plus the mass shift of the
+# recorded precursor type (tolerance 0.002). Records without a precursor type
+# pass trivially; "[M]+" and "[M]-" are recognised but not checked.
+test.precursormz <- function(){
+	precursorlist <- c("[M+H]+","[M+Na]+","[M-H]-","[M+HCOO-]-","[M]+","[M]-")
+	# Mass shift added to the exact mass for each checked precursor type
+	adductShift <- c("[M+H]+" = 1.008, "[M+Na]+" = 22.989, "[M-H]-" = -1.008, "[M+HCOO-]-" = 45.017)
+	record <- RMassBank.env$mb[[RMassBank.env$testnumber]]@compiled_ok[[1]]
+	precursorType <- record[['MS$FOCUSED_ION']][['PRECURSOR_TYPE']]
+	if(is.na(precursorType)){
+		checkTrue(TRUE)
+	} else{
+		precursor <- grep(precursorType,precursorlist, value = TRUE, fixed = TRUE)
+		for(adduct in names(adductShift)){
+			if(precursor == adduct){
+				checkEquals(record[['MS$FOCUSED_ION']][['PRECURSOR_M/Z']],record[['CH$EXACT_MASS']] + adductShift[[adduct]],tolerance = 0.002)
+			}
+		}
+	}
+}
+
+# Check that the recorded PRECURSOR_TYPE is one of the supported precursor
+# types; records with an NA precursor type pass trivially.
+test.PrecursorType <- function(){
+	precursorlist <- c("[M+H]+","[M+Na]+","[M-H]-","[M+HCOO-]-","[M]+","[M]-")
+	record <- RMassBank.env$mb[[RMassBank.env$testnumber]]@compiled_ok[[1]]
+	precursorType <- record[['MS$FOCUSED_ION']][['PRECURSOR_TYPE']]
+	if(!is.na(precursorType)){
+		checkTrue(precursorType %in% precursorlist)
+	}else{
+		checkTrue(TRUE)
+	}
+}
+
+# Check that the mass computed by smiles2mass() from CH$SMILES agrees with
+# CH$EXACT_MASS, using a tolerance of Exact_Mass/100.
+test.smilesvsexactmass <- function(){
+	record <- RMassBank.env$mb[[RMassBank.env$testnumber]]@compiled_ok[[1]]
+	massFromSmiles <- smiles2mass(record[['CH$SMILES']])
+	recordedMass <- record[['CH$EXACT_MASS']]
+	checkEquals(massFromSmiles, recordedMass, tolerance = recordedMass/100)
+}
+
+# Check that the second column of the peak table (PK$PEAK) sums to a
+# positive value.
+test.sumintensities <- function(){
+	record <- RMassBank.env$mb[[RMassBank.env$testnumber]]@compiled_ok[[1]]
+	secondColumn <- record[['PK$PEAK']][,2]
+	checkTrue(sum(secondColumn) > 0)
+}
+
+# Check that RECORD_TITLE contains the record's PRECURSOR_TYPE (skipped when
+# the precursor type is NA). Side effect: increments RMassBank.env$testnumber
+# before the check, which is then performed on the record at the previous
+# index.
+test.TitleVsType <- function(){
+		currentIndex <- RMassBank.env$testnumber
+		RMassBank.env$testnumber <- currentIndex + 1
+		record <- RMassBank.env$mb[[currentIndex]]@compiled_ok[[1]]
+		precursorType <- record[['MS$FOCUSED_ION']][['PRECURSOR_TYPE']]
+		if(!is.na(precursorType)){
+			checkTrue(grepl(precursorType, record[['RECORD_TITLE']], fixed = TRUE))
+		}else{
+			checkTrue(TRUE)
+		}
+}
--- a/inst/unitTests/runit.MSn.test.slashes.R
+++ b/inst/unitTests/runit.MSn.test.slashes.R
+# Check that the number of '/' characters in PRECURSOR_M/Z equals the MS level
+# minus 2, where the MS level is the numeric part of MS_TYPE after its first
+# two characters. Reads `mb` and `testNumber` from the enclosing environment.
+test.slashes <- function(){
+	Type <- as.numeric(substring(mb@compiled_ok[[testNumber]][['AC$MASS_SPECTROMETRY']][['MS_TYPE']],first = 3))
+	# gregexpr() returns a list with one element per input string, so length()
+	# of the result is always 1 here. Count the match positions inside the
+	# element instead; a single -1 means "no match".
+	slashPositions <- gregexpr('/', mb@compiled_ok[[testNumber]][['MS$FOCUSED_ION']][['PRECURSOR_M/Z']], fixed = TRUE)[[1]]
+	slashes <- sum(slashPositions > 0)
+	checkEquals(Type - 2,slashes)
+}
\ No newline at end of file
--- a/man/RmbSettings.Rd
+++ b/man/RmbSettings.Rd
@@ -5,14 +5,14 @@
  Describes all settings for the RMassBank settings file.
 }
 \details{
-  \itemize{ \item{\code{deprofile}}{Whether and how to
+  \itemize{ \item{\code{deprofile}}{ Whether and how to
  deprofile input raw files. Leave the setting empty if
  your raw files are already in "centroid" mode. If your
  input files are in profile mode, you have the choice
  between algorithms \code{\link{deprofile}.spline,
  deprofile.fwhm, deprofile.localMax}; refer to the
  individual manpages for more information.}
-  \item{\code{rtMargin, rtShift}}{The allowed retention
+  \item{\code{rtMargin, rtShift}}{ The allowed retention
  time deviation relative to the values specified in your
  compound list (see \code{\link{loadList}}), and the
  systematic shift (due to the use of, e.g., pre-columns or
@@ -24,11 +24,11 @@
  OpenBabel; the CACTUS structures have explicit hydrogen
  atoms.  The path should point to the directory where
  babel.exe (or the Linux "babel" equivalent) lies.  }
-  \item{\code{use_version}}{Which MassBank record format to
-  use; version 2 is strongly advised, version 1 is
+  \item{\code{use_version}}{ Which MassBank record format
+  to use; version 2 is strongly advised, version 1 is
  considered outdated and should be used only if for some
  reason you are running old servers and an upgrade is not
-  feasible.} \item{\code{use_rean_peaks}}{Whether to
+  feasible.} \item{\code{use_rean_peaks}}{ Whether to
  include peaks from reanalysis (see
  \code{\link{reanalyzeFailpeaks}}) in the MassBank
  records. Boolean, TRUE or FALSE.  }
@@ -36,10 +36,10 @@
  to use in the MassBank records. The entries
  \code{authors, copyright, license, instrument,
  instrument_type, compound_class} correspond to the
-  MassBank entries \code{AUTHORS, COPYRIGHT, LICENSE,
-  AC$INSTRUMENT, AC$INSTRUMENT_TYPE, CH$COMPOUND_CLASS}.
-  The entry \code{confidence_comment} is added as
-  \code{COMMENT: CONFIDENCE} entry.
+  MassBank entries \code{AUTHORS, COPYRIGHT, PUBLICATION,
+  LICENSE, AC$INSTRUMENT, AC$INSTRUMENT_TYPE,
+  CH$COMPOUND_CLASS}. The entry \code{confidence_comment}
+  is added as \code{COMMENT: CONFIDENCE} entry.

  The entry \code{internal_id_fieldname} is used to name
  the MassBank entry which will keep a reference to the
@@ -63,32 +63,116 @@
  Entries under \code{ms_dataprocessing} are added as
  \code{MS$DATA_PROCESSING:} entries, in addition to the
  default \code{WHOLE: RMassBank}.  }
-  \item{\code{spectraList}}{This setting describes the
-  experimental annotations for the single data-dependent
-  scans. For every data-dependent scan event, a
-  \code{spectraList} entry with \code{mode, ces, ce, res}
-  denoting collision mode, collision energy in short and
-  verbose notation, and FT resolution.}
-  \item{\code{accessionNumberShifts}}{This denotes the
+  \item{\code{annotator}}{ For advanced users: option to
+  select your own custom annotator.  Check
+  \code{\link{annotator.default}} and the source code for
+  details.} \item{\code{spectraList}}{ This setting
+  describes the experimental annotations for the single
+  data-dependent scans. For every data-dependent scan
+  event, a \code{spectraList} entry with \code{mode, ces,
+  ce, res} denoting collision mode, collision energy in
+  short and verbose notation, and FT resolution.}
+  \item{\code{accessionNumberShifts}}{ This denotes the
  starting points for accession numbers for different ion
  types. For example, \code{pH: 0, mH: 50} means that
  [M+H]+ spectra will start at \code{XX123401} (\code{XX}
  being the \code{entry_prefix} and \code{1234} the
  compound id) and [M-H]- will start at \code{XX123451}.}
-  \item{\code{electronicNoise, electronicNoiseWidth}}{Known
-  electronic noise peaks and the window to be used by
-  \code{\link{cleanElnoise}}}
-  \item{\code{recalibrateBy}}{\code{dppm} or \code{dmz} to
-  recalibrate either by delta ppm or by delta mz.}
-  \item{\code{recalibrateMS1}}{\code{common} or
-  \code{separate} to recalibrate MS1 data points together
-  or separately from MS2 data points.}
-  \item{\code{recalibrator: MS1, MS2}}{The functions to use
-  for recalibration of MS1 and MS2 data points.  Note that
-  the \code{MS1} setting is only meaningful if
+  \item{\code{electronicNoise, electronicNoiseWidth}}{
+  Known electronic noise peaks and the window to be used by
+  \code{\link{cleanElnoise}}} \item{\code{recalibrateBy}}{
+  \code{dppm} or \code{dmz} to recalibrate either by delta
+  ppm or by delta mz.} \item{\code{recalibrateMS1}}{
+  \code{common} or \code{separate} to recalibrate MS1 data
+  points together or separately from MS2 data points.}
+  \item{\code{recalibrator: MS1, MS2}}{ The functions to
+  use for recalibration of MS1 and MS2 data points.  Note
+  that the \code{MS1} setting is only meaningful if
  \code{recalibrateMS1: separate}, otherwise the \code{MS2}
  setting is used for a common recalibration curve. See
-  \code{\link{recalibrate.loess}} for details.  } }
+  \code{\link{recalibrate.loess}} for details.}
+  \item{\code{multiplicityFilter}}{ Define the multiplicity
+  filtering level. Default is 2, a value of 1 is off (no
+  filtering) and >2 is harsher filtering.}
+  \item{\code{titleFormat}}{ The title of MassBank records
+  is a mini-summary of the record, for example
+  "Dinotefuran; LC-ESI-QFT; MS2; CE: 35\%; R=35000; [M+H]+".
+  By default, the first compound name \code{CH$NAME},
+  instrument type \code{AC$INSTRUMENT_TYPE}, MS/MS type
+  \code{AC$MASS_SPECTROMETRY: MS_TYPE}, collision energy
+  \code{RECORD_TITLE_CE}, resolution
+  \code{AC$MASS_SPECTROMETRY: RESOLUTION} and precursor
+  \code{MS$FOCUSED_ION: PRECURSOR_TYPE} are used. If
+  alternative information is relevant to differentiate
+  acquired spectra, the title should be adjusted.  For
+  example, many TOFs do not have a resolution setting.  See
+  MassBank documentation for more.}
+  \item{\code{filterSettings}}{ A list of settings that
+  affect the MS/MS processing. The entries
+  \code{ppmHighMass, ppmLowMass, massRangeDivision} set
+  values for pre-processing, prior to recalibration.
+  \code{ppmHighMass} defines the ppm error for the high
+  mass range (default 10 ppm for Orbitraps),
+  \code{ppmLowMass} is the error for the low mass range
+  (default 15 ppm for Orbitraps) and
+  \code{massRangeDivision} is the m/z value defining the
+  split between the high and low mass range (default m/z =
+  120).
+
+  The entry \code{ppmFine} defines the ppm cut-off post
+  recalibration.  The default value of 5 ppm is recommended
+  for Orbitraps. For other instruments this can be
+  interpreted from the recalibration plot.  All ppm limits
+  are one-sided (e.g. this includes values to +5 ppm or -5
+  ppm deviation from the exact mass).
+
+  The entries \code{prelimCut, prelimCutRatio} define the
+  intensity cut-off and cut-off ratio (in \% of the most
+  intense peak) for pre-processing. This affects the peak
+  selection for the recalibration only. Careful: the
+  default value 1e4 for Orbitrap LTQ positive mode could
+  remove all peaks for TOF data and will remove too many
+  peaks for Orbitrap LTQ negative mode spectra!
+
+  The entry \code{specOkLimit} defines the intensity limit
+  to include MS/MS spectra.  MS/MS spectra must have at
+  least one peak above this limit to proceed through the
+  workflow.
+
+  \code{dbeMinLimit} defines the minimum allowable ring and
+  double bond equivalents (DBE) allowed for assigned
+  formulas. This assumes maximum valences for elements
+  with multiple valence states. The default is -0.5
+  (accounting for fragments being ions).
+
+  The entries \code{satelliteMzLimit, satelliteIntLimit}
+  define the cut-off m/z and intensity values for satellite
+  peak removal (an artefact of Fourier Transform
+  processing). All peaks within the m/z limit (default 0.5)
+  and intensity ratio (default 0.05 or 5 \%) of the
+  respective peak will be removed. Applicable to Fourier
+  Transform instruments only (e.g. Orbitrap).  }
+  \item{\code{findMsMsRawSettings}}{ Parameters for adjusting
+  the raw data retrieval.  The entry \code{ppmFine} defines
+  the ppm error to look for the precursor in the MS1
+  (parent) spectrum. Default is 10 ppm for Orbitrap.
+
+  \code{mzCoarse} defines the error to search for the
+  precursor specification in the MS2 spectrum. This is
+  often only saved to 2 decimal places and thus can be
+  quite inaccurate. The accuracy also depends on the
+  isolation window used.  The default settings (for e.g.
+  Orbitrap) is 0.5 (Da, or Th for m/z).
+
+  The entry \code{fillPrecursorScan} is largely untested.
+  The default value (FALSE) assumes all necessary precursor
+  information is available in the mzML file.  A setting of
+  TRUE tries to fill in the precursor data scan number if
+  it is missing.  Only tested on one case study so far -
+  feedback welcome!  } }
+}
+\author{
+  Michael Stravs, Emma Schymanski
 }
 \seealso{
  \code{\link{loadRmbSettings}}