diff --git a/DESCRIPTION b/DESCRIPTION index 342da0989857ca8fc8f3ba413192e6f1572ef308..e2bc86d1874b0ea97aae8d1bc3c7c69e881cf829 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 1.9.5.2 +Version: 1.9.7 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), diff --git a/R/createMassBank.R b/R/createMassBank.R index a78cdb86e96309b42588bf091c486219196cc0ed..78faaa5e74b3d41fda17f8c9690d33d880f31f12 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -244,7 +244,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c # check which compounds have useful spectra mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) mb@problems <- which(is.na(mb@compiled)) - mb@compiled_ok <- mb@compiled[mb@ok] + mb@compiled_ok <- mb@compiled[mb@ok] } # Step 5: Convert the internal tree-like representation of the MassBank data into # flat-text string arrays (basically, into text-file style, but still in memory) @@ -327,6 +327,13 @@ createMolfile <- function(id_or_smiles, fileName = FALSE) if(is.na(babeldir)) { res <- getCactus(smiles, "sdf") + + if(any(is.na(res))){ + res <- getPcSDF(smiles) + } + if(any(is.na(res))){ + stop("Pubchem and Cactus both seem to be down.") + } if(is.character(fileName)) writeLines(res, fileName) } @@ -489,8 +496,12 @@ gatherData <- function(id) inchikey_split <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) } else{ inchikey <- getCactus(smiles, 'stdinchikey') - ##Split the "InChiKey=" part off the key - inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]] + if(!is.na(inchikey)){ + ##Split the "InChiKey=" part off the key + inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]] + } else{ + inchikey_split <- getPcInchiKey(smiles) + } } ##Use Pubchem to retrieve information @@ -525,7 +536,7 @@ gatherData <- function(id) warning(paste0("Compound ID ",id,": no IUPAC name could be identified.")) } - names <- as.list(unique(c(dbname, synonym, iupacName))) + names <- as.list(unique(toupper(c(dbname, synonym, iupacName)))) ##If no name is found, it must be supplied in one way or another if(all(sapply(names, function(x) x == ""))){ @@ -617,7 +628,7 @@ gatherData <- function(id) } # HMDB if(!is.na(CTSinfo[1])){ - if("HMDB" %in% CTS.externalIdTypes(CTSinfo)) + if("Human Metabolome Database" %in% CTS.externalIdTypes(CTSinfo)) link[["HMDB"]] <- CTS.externalIdSubset(CTSinfo,"HMDB")[[1]] # KEGG if("KEGG" %in% CTS.externalIdTypes(CTSinfo)) @@ -632,7 +643,7 @@ gatherData <- function(id) if("PubChem CID" %in% CTS.externalIdTypes(CTSinfo)) { pc <- CTS.externalIdSubset(CTSinfo,"PubChem CID") - link[["PUBCHEM"]] <- paste0("CID:",min(pc)) + link[["PUBCHEM"]] <- paste0(min(pc)) } } } else{ @@ -1745,26 +1756,36 @@ makeMollist <- function(compiled) #' @export addPeaks <- function(mb, filename_or_dataframe) { + + errorvar <- 0 + currEnvir <- environment() + d <- 1 + if(is.data.frame(filename_or_dataframe)) df <- filename_or_dataframe else - df <- read.csv(filename_or_dataframe) + tryCatch( + df <- read.csv(filename_or_dataframe), + error=function(e){ + currEnvir$errorvar <- 1 + }) # I change your heuristic fix to another heuristic fix, because I will have to test for a column name change... - if(ncol(df) < 2) - df <- read.csv2(filename_or_dataframe, stringsAsFactors=FALSE) - # here: the column int was renamed to intensity, and we need to be able to read old files. sorry. - if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df))) - df$intensity <- df$int - - cols <- c("cpdID", "scan", "mzFound", "intensity", "OK") - n <- colnames(df) + if(!errorvar){ - # Check if comma-separated or semicolon-separated - d <- setdiff(cols, n) - - if(length(d)>0){ - stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, int, OK.") + if(ncol(df) < 2) + df <- read.csv(filename_or_dataframe, sep=";") + # here: the column int was renamed to intensity, and we need to be able to read old files. sorry. + if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df))) + df$intensity <- df$int + + cols <- c("cpdID", "scan", "mzFound", "intensity", "OK") + n <- colnames(df) + # Check if comma-separated or semicolon-separated + d <- setdiff(cols, n) + if(length(d)>0){ + stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, intensity, OK.") + } } culled_df <- df[,c("cpdID", "scan", "mzFound", "intensity", "OK")] diff --git a/R/leMsMs.r b/R/leMsMs.r index 7bb8b0b3b37423add47a317c15016ab93f208cc7..8d69c82b411f71a61d2ac4c471711121e406b4ef 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -40,8 +40,8 @@ archiveResults <- function(w, fileName, settings = getOption("RMassBank")) #' workflow. #' #' @param w A \code{msmsWorkspace} to work with. -#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-). +#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA", "pNH4"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-, [M+NH4]+). #' @param steps Which steps of the workflow to process. See the vignette #' \code{vignette("RMassBank")} for details. #' @param confirmMode Defaults to false (use most intense precursor). Value 1 uses @@ -79,7 +79,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec progressbar = "progressBarHook", MSe = FALSE) { .checkMbSettings() - if(!any(mode %in% c("pH","pNa","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) + if(!any(mode %in% c("pH","pNa","pNH4","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) if(!is.na(archivename)) w@archivename <- archivename @@ -1875,9 +1875,9 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) # rename (because "formulacol" is not the actually correct name) colnames(multInfo) <- c("cpdID", formulacol, "formulaMultiplicity") - if(!is.data.frame(peaks)) + if(!is.data.frame(peaks) || (nrow(peaks) == 0) ) { - stop("filterPeaksMultiplicity: All peaks have been filtered.") + warning("filterPeaksMultiplicity: All peaks have been filtered.") peaks <- cbind(peaks, data.frame(formulaMultiplicity=numeric())) } else diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index 0e0faa0554e75dc6e2ce67a6f06334ca9757e34a..66018dc109e5b94cf509e7c04f1a44adbf6403ea 100755 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -139,7 +139,7 @@ findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMo rtLimits <- c(dbRt$RT - rtMargin, dbRt$RT + rtMargin) * 60 } spectra <- findMsMsHR.mass(msRaw, mz, mzCoarse, limit.fine, rtLimits, confirmMode + 1,headerCache - ,fillPrecursorScan, deprofile, peaksCache) + ,fillPrecursorScan, deprofile, peaksCache, cpdID) # check whether a) spectrum was found and b) enough spectra were found if(length(spectra) < (confirmMode + 1)) sp <- new("RmbSpectraSet", found=FALSE) @@ -162,17 +162,9 @@ findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMo #' @export findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA, headerCache = NULL, fillPrecursorScan = FALSE, - deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL) + deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL, cpdID = NA) { eic <- findEIC(msRaw, mz, limit.fine, rtLimits, headerCache=headerCache, - - - - - - - - peaksCache=peaksCache) # if(!is.na(rtLimits)) # { @@ -183,6 +175,12 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, else headerData <- as.data.frame(header(msRaw)) + + ###If no precursor scan number, fill the number + if(length(unique(headerData$precursorScanNum)) == 1){ + fillPrecursorScan <- TRUE + } + if(fillPrecursorScan == TRUE) { # reset the precursor scan number. first set to NA, then @@ -217,7 +215,10 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, }) validPrecursors <- validPrecursors[which(which_OK==TRUE)] if(length(validPrecursors) == 0){ - warning("No precursor was detected. It is recommended to try to use the setting fillPrecursorScan: TRUE in the ini-file") + if(!is.na(cpdID)) + warning(paste0("No precursor was detected for compound, ", cpdID, " with m/z ", mz, ". Please check the mass and retention time window.")) + else + warning(paste0("No precursor was detected for m/z ", mz, ". Please check the mass and retention time window.")) } # Crop the "EIC" to the valid precursor scans eic <- eic[eic$scan %in% validPrecursors,] @@ -239,7 +240,7 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, childHeaders <- headerData[(headerData$precursorScanNum == masterScan) & (headerData$precursorMZ > mz - limit.coarse) & (headerData$precursorMZ < mz + limit.coarse) ,] - childScans <- childHeaders$acquisitionNum + childScans <- childHeaders$seqNum msPeaks <- mzR::peaks(msRaw, masterHeader$seqNum) # if deprofile option is set: run deprofiling diff --git a/R/msmsRead.R b/R/msmsRead.R index 2933204166f33ca673696f91f6a266249a77e17f..364de188fef178239f02d9962a6e4fc6683267f2 100644 --- a/R/msmsRead.R +++ b/R/msmsRead.R @@ -43,7 +43,7 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, readMethod, mode, confirmMode = FALSE, useRtLimit = TRUE, Args = NULL, settings = getOption("RMassBank"), progressbar = "progressBarHook", MSe = FALSE, plots = FALSE){ - + .checkMbSettings() ##Read the files and cpdids according to the definition ##All cases are silently accepted, as long as they can be handled according to one definition if(!any(mode %in% c("pH","pNa","pM","pNH4","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) @@ -339,7 +339,13 @@ msmsRead.RAW <- function(w, xRAW = NULL, cpdids = NULL, mode, findPeaksArgs = NU if(all(w@files != xRAW[[1]]@filepath)){ w@files <- c(w@files,xRAW[[1]]@filepath) } else{ - w@files <- c(w@files,paste0(xRAW[[1]]@filepath,"_2")) + for(i in 2:(length(w@files)+1)){ + currentFPath <- paste0(xRAW[[1]]@filepath,"_",i) + if(all(w@files != currentFPath)){ + w@files <- c(w@files,currentFPath) + break + } + } } names(w@specs)[length(w@specs)] <- basename(w@files[length(w@files)]) return(w) diff --git a/R/validateMassBank.R b/R/validateMassBank.R index c9857b0cc4fa18908b32c832add8071e3498c718..8ce66904f6c134c564988bf7bd21c1910a329737 100644 --- a/R/validateMassBank.R +++ b/R/validateMassBank.R @@ -178,39 +178,52 @@ smiles2mass <- function(SMILES){ .unitTestRMB <- function(WD=getwd()){ require(RUnit) library(RMassBank) - library(RMassBankData) + require(RMassBankData) oldwd <- getwd() setwd(WD) w <- newMsmsWorkspace() RmbDefaultSettings() files <- list.files(system.file("spectra", package="RMassBankData"), ".mzML", full.names = TRUE) - basename(files) - # To make the workflow faster here, we use only 2 compounds: w@files <- files loadList(system.file("list/NarcoticsDataset.csv", package="RMassBankData")) w <- msmsWorkflow(w, mode="pH", steps=c(1:4), archivename = "pH_narcotics") + storedW <- loadMsmsWorkspace(system.file("results/pH_narcotics_RF.RData", + package="RMassBankData")) + storedW <- msmsWorkflow(storedW, mode="pH", steps=4) + w@rc <- storedW@rc + w@rc.ms1 <- storedW@rc.ms1 + w <- msmsWorkflow(w, mode="pH", steps=4, archivename = + "pH_narcotics", newRecalibration = FALSE) w <- msmsWorkflow(w, mode="pH", steps=c(5:8), archivename = "pH_narcotics") - w2 <- newMbWorkspace(w) - #w2 <- mbWorkflow(w2) - #w2 <- loadInfolist(w2, "infolist.csv") - #w2 <- mbWorkflow(w2) + mb <- newMbWorkspace(w) + mb <- resetInfolists(mb) + mb <- loadInfolists(mb, system.file("infolists_incomplete", + package="RMassBankData")) + mb <- mbWorkflow(mb, infolist_path="./Narcotics_infolist.csv") + mb <- resetInfolists(mb) + mb <- loadInfolists(mb, system.file("infolists", package="RMassBankData")) + mb <- mbWorkflow(mb) testSuite <- defineTestSuite("Electronic noise and formula calculation Test", dirs = system.file("unitTests", package="RMassBank"), testFileRegexp = "runit.EN_FC.R", #testFuncRegexp = "^test.+", rngKind = "Marsaglia-Multicarry", rngNormalKind = "Kinderman-Ramage") - + testSuite2 <- defineTestSuite("Evaluation of data acquisition process", dirs = system.file("unitTests", + package="RMassBank"), testFileRegexp = "runit.DA.R", + #testFuncRegexp = "^test.+", + rngKind = "Marsaglia-Multicarry", + rngNormalKind = "Kinderman-Ramage") testData <- suppressWarnings(runTestSuite(testSuite)) + testData2 <- suppressWarnings(runTestSuite(testSuite2)) file.remove(c("pH_narcotics_Failpeaks.csv","pH_narcotics.RData","pH_narcotics_RA.RData","pH_narcotics_RF.RData")) - - # Prints the HTML-record printTextProtocol(testData) + printTextProtocol(testData2) setwd(oldwd) return(testData) } diff --git a/R/webAccess.R b/R/webAccess.R index bf26e8dbe2ad4cd4216723992e86c12e5c3ac401..53be3d19dfd35a9f527240bacc7858f4471cdc90 100755 --- a/R/webAccess.R +++ b/R/webAccess.R @@ -313,6 +313,8 @@ CTS.externalIdTypes <- function(data) } } + + getPcCHEBI <- function(query, from = "inchikey") { # Get the JSON-Data from Pubchem @@ -466,3 +468,59 @@ getPcIUPAC <- function (query, from = "inchikey") return(IUPACEntries[[1]]$value$sval) } } + +getPcInchiKey <- function(query, from = "smiles"){ + # Get the JSON-Data from Pubchem + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "record", "json", sep="/") + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the entries which contain Chebi-links + if(!is.null(r$PC_Compounds[[1]]$props)){ + INKEYindex <- which(sapply(r$PC_Compounds[[1]]$props, function(x) x$urn$label) == "InChIKey") + if(length(INKEYindex) > 0){ + return(r$PC_Compounds[[1]]$props[[INKEYindex]]$value$sval) + } else{return(NA)} + } else{return(NA)} + + +} + +getPcSDF <- function(query, from = "smiles"){ + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "sdf", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + molEnd <- regexpr(data,pattern="M END",fixed=TRUE)+5 + data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$") + return(data) +} diff --git a/inst/unitTests/runit.DA.R b/inst/unitTests/runit.DA.R new file mode 100644 index 0000000000000000000000000000000000000000..f8f522640becce76cf3e1568b015eaec45b6555a --- /dev/null +++ b/inst/unitTests/runit.DA.R @@ -0,0 +1,29 @@ +test.mzRRead <- function(){ + allOK <- TRUE + records <- list.files("XX/recdata",full.names=TRUE) + rightrecords <- list.files(system.file("records/XX/recdata", package="RMassBankData"), full.names=TRUE) + + for(i in 1:length(records)){ + gen <- file(description = records[i], open = "r", blocking = TRUE, + encoding = getOption("encoding"), raw = FALSE) + genLines <- readLines(gen) + RMBLine <- grep("MS$DATA_PROCESSING: WHOLE",genLines, fixed=TRUE) + DATELine <- grep("DATE: ",genLines, fixed=TRUE) + genLines <- genLines[-c(RMBLine,DATELine)] + print(rightrecords[i]) + rig <- file(description = rightrecords[i], open = "r", blocking = TRUE, + encoding = getOption("encoding"), raw = FALSE) + rigLines <- readLines(rig) + RMBLine <- grep("MS$DATA_PROCESSING: WHOLE",rigLines, fixed=TRUE) + DATELine <- grep("DATE: ",rigLines, fixed=TRUE) + rigLines <- rigLines[-c(RMBLine,DATELine)] + close(gen) + close(rig) + if(!identical(genLines,rigLines)){ + allOK <- FALSE + break + } + + } + checkTrue(allOK) +} \ No newline at end of file diff --git a/inst/unitTests/runit.EN_FC.R b/inst/unitTests/runit.EN_FC.R index 1b6e5e43e960ca1e7a7440ad7ae1e9cf4c2f0718..9b58b982349b35fb63c4d78fa9339d7fc034d5a3 100644 --- a/inst/unitTests/runit.EN_FC.R +++ b/inst/unitTests/runit.EN_FC.R @@ -1,10 +1,9 @@ test.eletronicnoise <- function(){ failpeaks <- read.csv("pH_narcotics_Failpeaks.csv") - checkTrue(!any(apply(failpeaks,1,function(x) all(x[2:4] == c(1738,2819,668,201.69144))))) } -test.formulacalc <- function(){ +test.formulacalculation <- function(){ failpeaks <- read.csv("pH_narcotics_Failpeaks.csv") checkTrue(!any(apply(failpeaks,1,function(x) all(x[2:4] == c(70,2758,321,56.04933))))) diff --git a/man/findMsMsHR.Rd b/man/findMsMsHR.Rd index ba78dc2fe8e2136709bb474539cce13505211be3..8bc8f43ce7087ee93509d66de7627978e73f21f8 100755 --- a/man/findMsMsHR.Rd +++ b/man/findMsMsHR.Rd @@ -15,7 +15,7 @@ findMsMsHR(fileName, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE, findMsMsHR.mass(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA, headerCache = NULL, fillPrecursorScan = FALSE, - deprofile = getOption("RMassBank")$deprofile) + deprofile = getOption("RMassBank")$deprofile, cpdID) findMsMsHR.direct(msRaw, cpdID, mode = "pH", confirmMode = 0, useRtLimit = TRUE, ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine,