Merge branch 'bioc2014' into refactoring and some fixes by hand

Conflicts: DESCRIPTION R/createMassBank.R R/leMsmsRaw.R

Merge branch 'bioc2014' into refactoring and some fixes by hand
Conflicts: DESCRIPTION R/createMassBank.R R/leMsmsRaw.R
26d217eb · Michele Stravs · 61adc3d7 · 1055dc92 · 26d217eb · 26d217eb
Commit 26d217eb authored 9 years ago by Michele Stravs
--- a/DESCRIPTION
+++ b/DESCRIPTION
 Package: RMassBank
 Type: Package
 Title: Workflow to process tandem MS files and build MassBank records
-Version: 1.9.5.2
+Version: 1.9.7
 Authors@R: c(
    person(given = "RMassBank at Eawag", email = "massbank@eawag.ch",
    role=c("cre")),

--- a/R/createMassBank.R
+++ b/R/createMassBank.R
@@ -244,7 +244,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c
 	  # check which compounds have useful spectra
 	  mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0))
 	  mb@problems <- which(is.na(mb@compiled))
-	  mb@compiled_ok <- mb@compiled[mb@ok]    
+	  mb@compiled_ok <- mb@compiled[mb@ok]
  }
  # Step 5: Convert the internal tree-like representation of the MassBank data into
  # flat-text string arrays (basically, into text-file style, but still in memory)
@@ -327,6 +327,13 @@ createMolfile <- function(id_or_smiles, fileName = FALSE)
 	if(is.na(babeldir))
 	{
 		res <- getCactus(smiles, "sdf")
+		
+		if(any(is.na(res))){
+			res <- getPcSDF(smiles)
+		}
+		if(any(is.na(res))){
+			stop("Pubchem and Cactus both seem to be down.")
+		}
 		if(is.character(fileName))
 			writeLines(res, fileName)
 	}
@@ -489,8 +496,12 @@ gatherData <- function(id)
 		inchikey_split <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE)
 	} else{
 		inchikey <- getCactus(smiles, 'stdinchikey')
-		##Split the "InChiKey=" part off the key
-		inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]]
+		if(!is.na(inchikey)){
+			##Split the "InChiKey=" part off the key
+			inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]]
+		} else{
+		    inchikey_split <- getPcInchiKey(smiles)
+		}
 	}
 	
 	##Use Pubchem to retrieve information
@@ -525,7 +536,7 @@ gatherData <- function(id)
 		warning(paste0("Compound ID ",id,": no IUPAC name could be identified."))
 	}
 	
-	names <- as.list(unique(c(dbname, synonym, iupacName)))
+	names <- as.list(unique(toupper(c(dbname, synonym, iupacName))))
 	
 	##If no name is found, it must be supplied in one way or another
 	if(all(sapply(names, function(x) x == ""))){
@@ -617,7 +628,7 @@ gatherData <- function(id)
 	}
 	# HMDB
 	if(!is.na(CTSinfo[1])){
-		if("HMDB" %in% CTS.externalIdTypes(CTSinfo))
+		if("Human Metabolome Database" %in% CTS.externalIdTypes(CTSinfo))
 			link[["HMDB"]] <- CTS.externalIdSubset(CTSinfo,"HMDB")[[1]]
 		# KEGG
 		if("KEGG" %in% CTS.externalIdTypes(CTSinfo))
@@ -632,7 +643,7 @@ gatherData <- function(id)
 			if("PubChem CID" %in% CTS.externalIdTypes(CTSinfo))
 			{
 				pc <- CTS.externalIdSubset(CTSinfo,"PubChem CID")
-				link[["PUBCHEM"]] <- paste0("CID:",min(pc))
+				link[["PUBCHEM"]] <- paste0(min(pc))
 			}
 		}
 	} else{
@@ -1745,26 +1756,36 @@ makeMollist <- function(compiled)
 #' @export 
 addPeaks <- function(mb, filename_or_dataframe)
 {
+	
+	errorvar <- 0
+	currEnvir <- environment()
+	d <- 1
+	
 	if(is.data.frame(filename_or_dataframe))
 		df <- filename_or_dataframe
 	else
-		df <- read.csv(filename_or_dataframe)
+	tryCatch(
+		df <- read.csv(filename_or_dataframe),
+		error=function(e){
+		currEnvir$errorvar <- 1
+	})
 	# I change your heuristic fix to another heuristic fix, because I will have to test for a column name change...
-	if(ncol(df) < 2)
-		df <- read.csv2(filename_or_dataframe, stringsAsFactors=FALSE)
-	# here: the column int was renamed to intensity, and we need to be able to read old files. sorry.
-	if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df)))
-		df$intensity <- df$int
-		
-	cols <- c("cpdID", "scan", "mzFound", "intensity", "OK")
 	
-	n <- colnames(df)
+	if(!errorvar){
 	
-	# Check if comma-separated or semicolon-separated
-	d <- setdiff(cols, n)
-	
-	if(length(d)>0){
-		stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, int, OK.")
+		if(ncol(df) < 2)
+			df <- read.csv(filename_or_dataframe, sep=";")
+		# here: the column int was renamed to intensity, and we need to be able to read old files. sorry.
+		if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df)))
+			df$intensity <- df$int
+		
+		cols <- c("cpdID", "scan", "mzFound", "intensity", "OK")
+		n <- colnames(df)
+		# Check if comma-separated or semicolon-separated
+		d <- setdiff(cols, n)
+		if(length(d)>0){
+			stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, intensity, OK.")
+		}
 	}
 	
 	culled_df <- df[,c("cpdID", "scan", "mzFound", "intensity", "OK")]

--- a/R/leMsMs.r
+++ b/R/leMsMs.r
@@ -40,8 +40,8 @@ archiveResults <- function(w, fileName, settings = getOption("RMassBank"))
 #' workflow.
 #' 
 #' @param w A \code{msmsWorkspace} to work with.
-#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA"} for different ions 
-#' 			([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-).
+#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA", "pNH4"} for different ions 
+#' 			([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-, [M+NH4]+).
 #' @param steps Which steps of the workflow to process. See the vignette 
 #' 			\code{vignette("RMassBank")} for details.
 #' @param confirmMode Defaults to false (use most intense precursor). Value 1 uses
@@ -79,7 +79,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec
 		progressbar = "progressBarHook", MSe = FALSE)
 {
    .checkMbSettings()
-  if(!any(mode %in% c("pH","pNa","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown."))
+  if(!any(mode %in% c("pH","pNa","pNH4","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown."))
  
  if(!is.na(archivename))
 	  w@archivename <- archivename
@@ -1875,9 +1875,9 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE)
 	# rename (because "formulacol" is not the actually correct name)
 	colnames(multInfo) <- c("cpdID", formulacol, "formulaMultiplicity")
 	
-	if(!is.data.frame(peaks))
+	if(!is.data.frame(peaks) || (nrow(peaks) == 0) )
 	{
-		stop("filterPeaksMultiplicity: All peaks have been filtered.")
+		warning("filterPeaksMultiplicity: All peaks have been filtered.")
 		peaks <- cbind(peaks, data.frame(formulaMultiplicity=numeric()))
 	}
 	else

--- a/R/leMsmsRaw.R
+++ b/R/leMsmsRaw.R
@@ -139,7 +139,7 @@ findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMo
 		rtLimits <- c(dbRt$RT - rtMargin, dbRt$RT + rtMargin) * 60
 	}
 	spectra <- findMsMsHR.mass(msRaw, mz, mzCoarse, limit.fine, rtLimits, confirmMode + 1,headerCache
-			,fillPrecursorScan, deprofile, peaksCache)
+			,fillPrecursorScan, deprofile, peaksCache, cpdID)
 	# check whether a) spectrum was found and b) enough spectra were found
 	if(length(spectra) < (confirmMode + 1))
 		sp <- new("RmbSpectraSet", found=FALSE)
@@ -162,17 +162,9 @@ findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMo
 #' @export
 findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA,
 		headerCache = NULL, fillPrecursorScan = FALSE,
-		deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL)
+		deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL, cpdID = NA)
 {
 	eic <- findEIC(msRaw, mz, limit.fine, rtLimits, headerCache=headerCache, 
-			
-			
-			
-			
-			
-			
-			
-			
 			peaksCache=peaksCache)
 	#	if(!is.na(rtLimits))
 	#	{  
@@ -183,6 +175,12 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA,
 	else
 		headerData <- as.data.frame(header(msRaw))
 	
+	
+	###If no precursor scan number, fill the number
+	if(length(unique(headerData$precursorScanNum)) == 1){
+		fillPrecursorScan <- TRUE
+	}
+
 	if(fillPrecursorScan == TRUE)
 	{
 		# reset the precursor scan number. first set to NA, then
@@ -217,7 +215,10 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA,
 			})
 	validPrecursors <- validPrecursors[which(which_OK==TRUE)]
 	if(length(validPrecursors) == 0){
-		warning("No precursor was detected. It is recommended to try to use the setting fillPrecursorScan: TRUE in the ini-file")
+		if(!is.na(cpdID))
+			warning(paste0("No precursor was detected for compound, ", cpdID, " with m/z ", mz, ". Please check the mass and retention time window."))
+		else
+			warning(paste0("No precursor was detected for m/z ", mz, ". Please check the mass and retention time window."))
 	}
 	# Crop the "EIC" to the valid precursor scans
 	eic <- eic[eic$scan %in% validPrecursors,]
@@ -239,7 +240,7 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA,
 				childHeaders <- headerData[(headerData$precursorScanNum == masterScan) 
 								& (headerData$precursorMZ > mz - limit.coarse) 
 								& (headerData$precursorMZ < mz + limit.coarse) ,]
-				childScans <- childHeaders$acquisitionNum
+				childScans <- childHeaders$seqNum
 				
 				msPeaks <- mzR::peaks(msRaw, masterHeader$seqNum)
 				# if deprofile option is set: run deprofiling

--- a/R/msmsRead.R
+++ b/R/msmsRead.R
@@ -43,7 +43,7 @@
 msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, 
 					readMethod, mode, confirmMode = FALSE, useRtLimit = TRUE, 
 					Args = NULL, settings = getOption("RMassBank"), progressbar = "progressBarHook", MSe = FALSE, plots = FALSE){
-	
+	.checkMbSettings()
 	##Read the files and cpdids according to the definition
 	##All cases are silently accepted, as long as they can be handled according to one definition
 	if(!any(mode %in% c("pH","pNa","pM","pNH4","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown."))
@@ -339,7 +339,13 @@ msmsRead.RAW <- function(w, xRAW = NULL, cpdids = NULL, mode, findPeaksArgs = NU
 	if(all(w@files != xRAW[[1]]@filepath)){
 		w@files <- c(w@files,xRAW[[1]]@filepath)
 	} else{
-		w@files <- c(w@files,paste0(xRAW[[1]]@filepath,"_2"))
+		for(i in 2:(length(w@files)+1)){
+			currentFPath <- paste0(xRAW[[1]]@filepath,"_",i)
+			if(all(w@files != currentFPath)){
+				w@files <- c(w@files,currentFPath)
+				break
+			}
+		}
 	}
 	names(w@specs)[length(w@specs)] <- basename(w@files[length(w@files)])
 	return(w)

--- a/R/validateMassBank.R
+++ b/R/validateMassBank.R
@@ -178,39 +178,52 @@ smiles2mass <- function(SMILES){
 .unitTestRMB <- function(WD=getwd()){
 	require(RUnit)
 	library(RMassBank)
-	library(RMassBankData)
+	require(RMassBankData)
 	oldwd <- getwd()
 	setwd(WD)
 	w <- newMsmsWorkspace()
 	RmbDefaultSettings()
 	files <- list.files(system.file("spectra", package="RMassBankData"),
 		 ".mzML", full.names = TRUE)
-	basename(files)
-	# To make the workflow faster here, we use only 2 compounds:
 	w@files <- files
 	loadList(system.file("list/NarcoticsDataset.csv", 
 		package="RMassBankData"))
 	w <- msmsWorkflow(w, mode="pH", steps=c(1:4), archivename = 
 					"pH_narcotics")
+	storedW <- loadMsmsWorkspace(system.file("results/pH_narcotics_RF.RData", 
+					package="RMassBankData"))
+	storedW <- msmsWorkflow(storedW, mode="pH", steps=4)
+	w@rc <- storedW@rc
+	w@rc.ms1 <- storedW@rc.ms1
+	w <- msmsWorkflow(w, mode="pH", steps=4, archivename = 
+	"pH_narcotics", newRecalibration = FALSE)
 	w <- msmsWorkflow(w, mode="pH", steps=c(5:8), archivename = 
 			"pH_narcotics")	
-	w2 <- newMbWorkspace(w)
-	#w2 <- mbWorkflow(w2)
-	#w2 <- loadInfolist(w2, "infolist.csv")
-	#w2 <- mbWorkflow(w2)
+	mb <- newMbWorkspace(w)
+	mb <- resetInfolists(mb)
+	mb <- loadInfolists(mb, system.file("infolists_incomplete",
+			package="RMassBankData"))
+	mb <- mbWorkflow(mb, infolist_path="./Narcotics_infolist.csv")
+	mb <- resetInfolists(mb)
+	mb <- loadInfolists(mb, system.file("infolists", package="RMassBankData"))
+	mb <- mbWorkflow(mb)
 	
 	testSuite <- defineTestSuite("Electronic noise and formula calculation Test", dirs = system.file("unitTests", 
 		package="RMassBank"), testFileRegexp = "runit.EN_FC.R",
 					#testFuncRegexp = "^test.+",
 					rngKind = "Marsaglia-Multicarry",
 					rngNormalKind = "Kinderman-Ramage")
-					
+	testSuite2 <- defineTestSuite("Evaluation of data acquisition process", dirs = system.file("unitTests", 
+		package="RMassBank"), testFileRegexp = "runit.DA.R",
+					#testFuncRegexp = "^test.+",
+					rngKind = "Marsaglia-Multicarry",
+					rngNormalKind = "Kinderman-Ramage")			
 	testData <- suppressWarnings(runTestSuite(testSuite))
+	testData2 <- suppressWarnings(runTestSuite(testSuite2))
 	
 	file.remove(c("pH_narcotics_Failpeaks.csv","pH_narcotics.RData","pH_narcotics_RA.RData","pH_narcotics_RF.RData"))
-	
-	# Prints the HTML-record
 	printTextProtocol(testData)
+	printTextProtocol(testData2)
 	setwd(oldwd)
 	return(testData)
 }
--- a/R/webAccess.R
+++ b/R/webAccess.R
@@ -313,6 +313,8 @@ CTS.externalIdTypes <- function(data)
  }
 }

+
+
 getPcCHEBI <- function(query, from = "inchikey")
 {
 	# Get the JSON-Data from Pubchem
@@ -466,3 +468,59 @@ getPcIUPAC <- function (query, from = "inchikey")
 		return(IUPACEntries[[1]]$value$sval)
 	}
 } 
+
+getPcInchiKey <- function(query, from = "smiles"){
+	# Get the JSON-Data from Pubchem
+	baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound"
+	url <- paste(baseURL, from, query, "record", "json", sep="/")
+	errorvar <- 0
+	currEnvir <- environment()
+	
+	tryCatch(
+		data <- getURL(URLencode(url),timeout=5),
+		error=function(e){
+		currEnvir$errorvar <- 1
+	})
+	
+	if(errorvar){
+		return(NA)
+	}
+	
+	r <- fromJSON(data)
+	
+	# This happens if the InChI key is not found:
+	if(!is.null(r$Fault))
+	return(NA)
+	
+	# Find the entries which contain Chebi-links
+	if(!is.null(r$PC_Compounds[[1]]$props)){
+		INKEYindex <- which(sapply(r$PC_Compounds[[1]]$props, function(x) x$urn$label) == "InChIKey")
+		if(length(INKEYindex) > 0){
+			return(r$PC_Compounds[[1]]$props[[INKEYindex]]$value$sval)
+		}	else{return(NA)}
+	}	else{return(NA)}
+
+	
+}
+
+getPcSDF <- function(query, from = "smiles"){
+	baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound"
+	url <- paste(baseURL, from, query, "sdf", sep="/")
+	
+	errorvar <- 0
+	currEnvir <- environment()
+	
+	tryCatch(
+		data <- getURL(URLencode(url),timeout=5),
+		error=function(e){
+		currEnvir$errorvar <- 1
+	})
+	
+	if(errorvar){
+		return(NA)
+	}
+	
+	molEnd <- regexpr(data,pattern="M  END",fixed=TRUE)+5
+	data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$")
+	return(data)
+}
--- a/inst/unitTests/runit.DA.R
+++ b/inst/unitTests/runit.DA.R
+test.mzRRead <- function(){
+	allOK <- TRUE
+	records <- list.files("XX/recdata",full.names=TRUE)
+	rightrecords <- list.files(system.file("records/XX/recdata", package="RMassBankData"), full.names=TRUE)
+	
+	for(i in 1:length(records)){
+		gen <- file(description = records[i], open = "r", blocking = TRUE,
+		encoding = getOption("encoding"), raw = FALSE)
+		genLines <- readLines(gen)
+		RMBLine <- grep("MS$DATA_PROCESSING: WHOLE",genLines, fixed=TRUE)
+        DATELine <- grep("DATE: ",genLines, fixed=TRUE)
+		genLines <- genLines[-c(RMBLine,DATELine)]
+		print(rightrecords[i])
+		rig <- file(description = rightrecords[i], open = "r", blocking = TRUE,
+		encoding = getOption("encoding"), raw = FALSE)
+		rigLines <- readLines(rig)
+		RMBLine <- grep("MS$DATA_PROCESSING: WHOLE",rigLines, fixed=TRUE)
+        DATELine <- grep("DATE: ",rigLines, fixed=TRUE)
+		rigLines <- rigLines[-c(RMBLine,DATELine)]
+		close(gen)
+		close(rig)
+		if(!identical(genLines,rigLines)){
+			allOK <- FALSE
+			break
+		}
+
+	}
+	checkTrue(allOK)
+}
\ No newline at end of file
--- a/inst/unitTests/runit.EN_FC.R
+++ b/inst/unitTests/runit.EN_FC.R
 test.eletronicnoise <- function(){
 	failpeaks <- read.csv("pH_narcotics_Failpeaks.csv")
-	
 	checkTrue(!any(apply(failpeaks,1,function(x) all(x[2:4] == c(1738,2819,668,201.69144)))))
 }

-test.formulacalc <- function(){
+test.formulacalculation <- function(){
 	failpeaks <- read.csv("pH_narcotics_Failpeaks.csv")
 	
 	checkTrue(!any(apply(failpeaks,1,function(x) all(x[2:4] == c(70,2758,321,56.04933)))))

--- a/man/findMsMsHR.Rd
+++ b/man/findMsMsHR.Rd
@@ -15,7 +15,7 @@ findMsMsHR(fileName, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE,

 		findMsMsHR.mass(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA,
 		headerCache = NULL, fillPrecursorScan = FALSE,
-		deprofile = getOption("RMassBank")$deprofile)
+		deprofile = getOption("RMassBank")$deprofile, cpdID)

 findMsMsHR.direct(msRaw, cpdID, mode = "pH", confirmMode = 0, useRtLimit = TRUE,
 			ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine,