Skip to content
Snippets Groups Projects
Commit 26d217eb authored by Michele Stravs's avatar Michele Stravs
Browse files

Merge branch 'bioc2014' into refactoring and some fixes by hand

Conflicts:
	DESCRIPTION
	R/createMassBank.R
	R/leMsmsRaw.R
parents 61adc3d7 1055dc92
No related branches found
No related tags found
No related merge requests found
Package: RMassBank
Type: Package
Title: Workflow to process tandem MS files and build MassBank records
Version: 1.9.5.2
Version: 1.9.7
Authors@R: c(
person(given = "RMassBank at Eawag", email = "massbank@eawag.ch",
role=c("cre")),
......
......@@ -244,7 +244,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c
# check which compounds have useful spectra
mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0))
mb@problems <- which(is.na(mb@compiled))
mb@compiled_ok <- mb@compiled[mb@ok]
mb@compiled_ok <- mb@compiled[mb@ok]
}
# Step 5: Convert the internal tree-like representation of the MassBank data into
# flat-text string arrays (basically, into text-file style, but still in memory)
......@@ -327,6 +327,13 @@ createMolfile <- function(id_or_smiles, fileName = FALSE)
if(is.na(babeldir))
{
res <- getCactus(smiles, "sdf")
if(any(is.na(res))){
res <- getPcSDF(smiles)
}
if(any(is.na(res))){
stop("Pubchem and Cactus both seem to be down.")
}
if(is.character(fileName))
writeLines(res, fileName)
}
......@@ -489,8 +496,12 @@ gatherData <- function(id)
inchikey_split <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE)
} else{
inchikey <- getCactus(smiles, 'stdinchikey')
##Split the "InChiKey=" part off the key
inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]]
if(!is.na(inchikey)){
##Split the "InChiKey=" part off the key
inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]]
} else{
inchikey_split <- getPcInchiKey(smiles)
}
}
##Use Pubchem to retrieve information
......@@ -525,7 +536,7 @@ gatherData <- function(id)
warning(paste0("Compound ID ",id,": no IUPAC name could be identified."))
}
names <- as.list(unique(c(dbname, synonym, iupacName)))
names <- as.list(unique(toupper(c(dbname, synonym, iupacName))))
##If no name is found, it must be supplied in one way or another
if(all(sapply(names, function(x) x == ""))){
......@@ -617,7 +628,7 @@ gatherData <- function(id)
}
# HMDB
if(!is.na(CTSinfo[1])){
if("HMDB" %in% CTS.externalIdTypes(CTSinfo))
if("Human Metabolome Database" %in% CTS.externalIdTypes(CTSinfo))
link[["HMDB"]] <- CTS.externalIdSubset(CTSinfo,"HMDB")[[1]]
# KEGG
if("KEGG" %in% CTS.externalIdTypes(CTSinfo))
......@@ -632,7 +643,7 @@ gatherData <- function(id)
if("PubChem CID" %in% CTS.externalIdTypes(CTSinfo))
{
pc <- CTS.externalIdSubset(CTSinfo,"PubChem CID")
link[["PUBCHEM"]] <- paste0("CID:",min(pc))
link[["PUBCHEM"]] <- paste0(min(pc))
}
}
} else{
......@@ -1745,26 +1756,36 @@ makeMollist <- function(compiled)
#' @export
addPeaks <- function(mb, filename_or_dataframe)
{
errorvar <- 0
currEnvir <- environment()
d <- 1
if(is.data.frame(filename_or_dataframe))
df <- filename_or_dataframe
else
df <- read.csv(filename_or_dataframe)
tryCatch(
df <- read.csv(filename_or_dataframe),
error=function(e){
currEnvir$errorvar <- 1
})
# I change your heuristic fix to another heuristic fix, because I will have to test for a column name change...
if(ncol(df) < 2)
df <- read.csv2(filename_or_dataframe, stringsAsFactors=FALSE)
# here: the column int was renamed to intensity, and we need to be able to read old files. sorry.
if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df)))
df$intensity <- df$int
cols <- c("cpdID", "scan", "mzFound", "intensity", "OK")
n <- colnames(df)
if(!errorvar){
# Check if comma-separated or semicolon-separated
d <- setdiff(cols, n)
if(length(d)>0){
stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, int, OK.")
if(ncol(df) < 2)
df <- read.csv(filename_or_dataframe, sep=";")
# here: the column int was renamed to intensity, and we need to be able to read old files. sorry.
if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df)))
df$intensity <- df$int
cols <- c("cpdID", "scan", "mzFound", "intensity", "OK")
n <- colnames(df)
# Check if comma-separated or semicolon-separated
d <- setdiff(cols, n)
if(length(d)>0){
stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, intensity, OK.")
}
}
culled_df <- df[,c("cpdID", "scan", "mzFound", "intensity", "OK")]
......
......@@ -40,8 +40,8 @@ archiveResults <- function(w, fileName, settings = getOption("RMassBank"))
#' workflow.
#'
#' @param w A \code{msmsWorkspace} to work with.
#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA"} for different ions
#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-).
#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA", "pNH4"} for different ions
#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-, [M+NH4]+).
#' @param steps Which steps of the workflow to process. See the vignette
#' \code{vignette("RMassBank")} for details.
#' @param confirmMode Defaults to false (use most intense precursor). Value 1 uses
......@@ -79,7 +79,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec
progressbar = "progressBarHook", MSe = FALSE)
{
.checkMbSettings()
if(!any(mode %in% c("pH","pNa","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown."))
if(!any(mode %in% c("pH","pNa","pNH4","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown."))
if(!is.na(archivename))
w@archivename <- archivename
......@@ -1875,9 +1875,9 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE)
# rename (because "formulacol" is not the actually correct name)
colnames(multInfo) <- c("cpdID", formulacol, "formulaMultiplicity")
if(!is.data.frame(peaks))
if(!is.data.frame(peaks) || (nrow(peaks) == 0) )
{
stop("filterPeaksMultiplicity: All peaks have been filtered.")
warning("filterPeaksMultiplicity: All peaks have been filtered.")
peaks <- cbind(peaks, data.frame(formulaMultiplicity=numeric()))
}
else
......
......@@ -139,7 +139,7 @@ findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMo
rtLimits <- c(dbRt$RT - rtMargin, dbRt$RT + rtMargin) * 60
}
spectra <- findMsMsHR.mass(msRaw, mz, mzCoarse, limit.fine, rtLimits, confirmMode + 1,headerCache
,fillPrecursorScan, deprofile, peaksCache)
,fillPrecursorScan, deprofile, peaksCache, cpdID)
# check whether a) spectrum was found and b) enough spectra were found
if(length(spectra) < (confirmMode + 1))
sp <- new("RmbSpectraSet", found=FALSE)
......@@ -162,17 +162,9 @@ findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMo
#' @export
findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA,
headerCache = NULL, fillPrecursorScan = FALSE,
deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL)
deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL, cpdID = NA)
{
eic <- findEIC(msRaw, mz, limit.fine, rtLimits, headerCache=headerCache,
peaksCache=peaksCache)
# if(!is.na(rtLimits))
# {
......@@ -183,6 +175,12 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA,
else
headerData <- as.data.frame(header(msRaw))
###If no precursor scan number, fill the number
if(length(unique(headerData$precursorScanNum)) == 1){
fillPrecursorScan <- TRUE
}
if(fillPrecursorScan == TRUE)
{
# reset the precursor scan number. first set to NA, then
......@@ -217,7 +215,10 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA,
})
validPrecursors <- validPrecursors[which(which_OK==TRUE)]
if(length(validPrecursors) == 0){
warning("No precursor was detected. It is recommended to try to use the setting fillPrecursorScan: TRUE in the ini-file")
if(!is.na(cpdID))
warning(paste0("No precursor was detected for compound, ", cpdID, " with m/z ", mz, ". Please check the mass and retention time window."))
else
warning(paste0("No precursor was detected for m/z ", mz, ". Please check the mass and retention time window."))
}
# Crop the "EIC" to the valid precursor scans
eic <- eic[eic$scan %in% validPrecursors,]
......@@ -239,7 +240,7 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA,
childHeaders <- headerData[(headerData$precursorScanNum == masterScan)
& (headerData$precursorMZ > mz - limit.coarse)
& (headerData$precursorMZ < mz + limit.coarse) ,]
childScans <- childHeaders$acquisitionNum
childScans <- childHeaders$seqNum
msPeaks <- mzR::peaks(msRaw, masterHeader$seqNum)
# if deprofile option is set: run deprofiling
......
......@@ -43,7 +43,7 @@
msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL,
readMethod, mode, confirmMode = FALSE, useRtLimit = TRUE,
Args = NULL, settings = getOption("RMassBank"), progressbar = "progressBarHook", MSe = FALSE, plots = FALSE){
.checkMbSettings()
##Read the files and cpdids according to the definition
##All cases are silently accepted, as long as they can be handled according to one definition
if(!any(mode %in% c("pH","pNa","pM","pNH4","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown."))
......@@ -339,7 +339,13 @@ msmsRead.RAW <- function(w, xRAW = NULL, cpdids = NULL, mode, findPeaksArgs = NU
if(all(w@files != xRAW[[1]]@filepath)){
w@files <- c(w@files,xRAW[[1]]@filepath)
} else{
w@files <- c(w@files,paste0(xRAW[[1]]@filepath,"_2"))
for(i in 2:(length(w@files)+1)){
currentFPath <- paste0(xRAW[[1]]@filepath,"_",i)
if(all(w@files != currentFPath)){
w@files <- c(w@files,currentFPath)
break
}
}
}
names(w@specs)[length(w@specs)] <- basename(w@files[length(w@files)])
return(w)
......
......@@ -178,39 +178,52 @@ smiles2mass <- function(SMILES){
.unitTestRMB <- function(WD=getwd()){
require(RUnit)
library(RMassBank)
library(RMassBankData)
require(RMassBankData)
oldwd <- getwd()
setwd(WD)
w <- newMsmsWorkspace()
RmbDefaultSettings()
files <- list.files(system.file("spectra", package="RMassBankData"),
".mzML", full.names = TRUE)
basename(files)
# To make the workflow faster here, we use only 2 compounds:
w@files <- files
loadList(system.file("list/NarcoticsDataset.csv",
package="RMassBankData"))
w <- msmsWorkflow(w, mode="pH", steps=c(1:4), archivename =
"pH_narcotics")
storedW <- loadMsmsWorkspace(system.file("results/pH_narcotics_RF.RData",
package="RMassBankData"))
storedW <- msmsWorkflow(storedW, mode="pH", steps=4)
w@rc <- storedW@rc
w@rc.ms1 <- storedW@rc.ms1
w <- msmsWorkflow(w, mode="pH", steps=4, archivename =
"pH_narcotics", newRecalibration = FALSE)
w <- msmsWorkflow(w, mode="pH", steps=c(5:8), archivename =
"pH_narcotics")
w2 <- newMbWorkspace(w)
#w2 <- mbWorkflow(w2)
#w2 <- loadInfolist(w2, "infolist.csv")
#w2 <- mbWorkflow(w2)
mb <- newMbWorkspace(w)
mb <- resetInfolists(mb)
mb <- loadInfolists(mb, system.file("infolists_incomplete",
package="RMassBankData"))
mb <- mbWorkflow(mb, infolist_path="./Narcotics_infolist.csv")
mb <- resetInfolists(mb)
mb <- loadInfolists(mb, system.file("infolists", package="RMassBankData"))
mb <- mbWorkflow(mb)
testSuite <- defineTestSuite("Electronic noise and formula calculation Test", dirs = system.file("unitTests",
package="RMassBank"), testFileRegexp = "runit.EN_FC.R",
#testFuncRegexp = "^test.+",
rngKind = "Marsaglia-Multicarry",
rngNormalKind = "Kinderman-Ramage")
testSuite2 <- defineTestSuite("Evaluation of data acquisition process", dirs = system.file("unitTests",
package="RMassBank"), testFileRegexp = "runit.DA.R",
#testFuncRegexp = "^test.+",
rngKind = "Marsaglia-Multicarry",
rngNormalKind = "Kinderman-Ramage")
testData <- suppressWarnings(runTestSuite(testSuite))
testData2 <- suppressWarnings(runTestSuite(testSuite2))
file.remove(c("pH_narcotics_Failpeaks.csv","pH_narcotics.RData","pH_narcotics_RA.RData","pH_narcotics_RF.RData"))
# Prints the HTML-record
printTextProtocol(testData)
printTextProtocol(testData2)
setwd(oldwd)
return(testData)
}
......@@ -313,6 +313,8 @@ CTS.externalIdTypes <- function(data)
}
}
getPcCHEBI <- function(query, from = "inchikey")
{
# Get the JSON-Data from Pubchem
......@@ -466,3 +468,59 @@ getPcIUPAC <- function (query, from = "inchikey")
return(IUPACEntries[[1]]$value$sval)
}
}
getPcInchiKey <- function(query, from = "smiles"){
# Get the JSON-Data from Pubchem
baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound"
url <- paste(baseURL, from, query, "record", "json", sep="/")
errorvar <- 0
currEnvir <- environment()
tryCatch(
data <- getURL(URLencode(url),timeout=5),
error=function(e){
currEnvir$errorvar <- 1
})
if(errorvar){
return(NA)
}
r <- fromJSON(data)
# This happens if the InChI key is not found:
if(!is.null(r$Fault))
return(NA)
# Find the entries which contain Chebi-links
if(!is.null(r$PC_Compounds[[1]]$props)){
INKEYindex <- which(sapply(r$PC_Compounds[[1]]$props, function(x) x$urn$label) == "InChIKey")
if(length(INKEYindex) > 0){
return(r$PC_Compounds[[1]]$props[[INKEYindex]]$value$sval)
} else{return(NA)}
} else{return(NA)}
}
getPcSDF <- function(query, from = "smiles"){
baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound"
url <- paste(baseURL, from, query, "sdf", sep="/")
errorvar <- 0
currEnvir <- environment()
tryCatch(
data <- getURL(URLencode(url),timeout=5),
error=function(e){
currEnvir$errorvar <- 1
})
if(errorvar){
return(NA)
}
molEnd <- regexpr(data,pattern="M END",fixed=TRUE)+5
data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$")
return(data)
}
test.mzRRead <- function(){
allOK <- TRUE
records <- list.files("XX/recdata",full.names=TRUE)
rightrecords <- list.files(system.file("records/XX/recdata", package="RMassBankData"), full.names=TRUE)
for(i in 1:length(records)){
gen <- file(description = records[i], open = "r", blocking = TRUE,
encoding = getOption("encoding"), raw = FALSE)
genLines <- readLines(gen)
RMBLine <- grep("MS$DATA_PROCESSING: WHOLE",genLines, fixed=TRUE)
DATELine <- grep("DATE: ",genLines, fixed=TRUE)
genLines <- genLines[-c(RMBLine,DATELine)]
print(rightrecords[i])
rig <- file(description = rightrecords[i], open = "r", blocking = TRUE,
encoding = getOption("encoding"), raw = FALSE)
rigLines <- readLines(rig)
RMBLine <- grep("MS$DATA_PROCESSING: WHOLE",rigLines, fixed=TRUE)
DATELine <- grep("DATE: ",rigLines, fixed=TRUE)
rigLines <- rigLines[-c(RMBLine,DATELine)]
close(gen)
close(rig)
if(!identical(genLines,rigLines)){
allOK <- FALSE
break
}
}
checkTrue(allOK)
}
\ No newline at end of file
test.eletronicnoise <- function(){
failpeaks <- read.csv("pH_narcotics_Failpeaks.csv")
checkTrue(!any(apply(failpeaks,1,function(x) all(x[2:4] == c(1738,2819,668,201.69144)))))
}
test.formulacalc <- function(){
test.formulacalculation <- function(){
failpeaks <- read.csv("pH_narcotics_Failpeaks.csv")
checkTrue(!any(apply(failpeaks,1,function(x) all(x[2:4] == c(70,2758,321,56.04933)))))
......
......@@ -15,7 +15,7 @@ findMsMsHR(fileName, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE,
findMsMsHR.mass(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA,
headerCache = NULL, fillPrecursorScan = FALSE,
deprofile = getOption("RMassBank")$deprofile)
deprofile = getOption("RMassBank")$deprofile, cpdID)
findMsMsHR.direct(msRaw, cpdID, mode = "pH", confirmMode = 0, useRtLimit = TRUE,
ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment