diff --git a/R/createMassBank.R b/R/createMassBank.R index d2aa3a5b77a432b04e776df545868fe1cedb4fcb..020230102957b70de9d8b95179ebb82b6737a754 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -160,6 +160,8 @@ resetInfolists <- function(mb) #' @param infolist_path A path where to store newly downloaded compound informations, #' which should then be manually inspected. #' @param mb The \code{mbWorkspace} to work in. +#' @param useCTS A boolean variable denoting whether to retrieve information using CTS or to use babel +#' to retrieve minimal information. #' @return The processed \code{mbWorkspace}. #' @seealso \code{\link{mbWorkspace-class}} #' @author Michael A. Stravs, Eawag <michael.stravs@@eawag.ch> @@ -170,22 +172,28 @@ resetInfolists <- function(mb) #' #' } #' @export -mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.csv") +mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.csv", useCTS = TRUE) { # Step 1: Find which compounds don't have annotation information yet. For these # compounds, pull information from CTS (using gatherData). if(1 %in% steps) { mbdata_ids <- lapply(mb@aggregatedRcSpecs$specFound, function(spec) spec$id) - - message("mbWorkflow: Step 1. Gather info from CTS") - + if(useCTS){ + message("mbWorkflow: Step 1. Gather info from CTS") + } else{ + message("mbWorkflow: Step 1. Gather info using babel") + } # Which IDs are not in mbdata_archive yet? new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id) mb@mbdata <- lapply(new_ids, function(id) { #print(id) - d <- gatherData(id) + if(useCTS){ + d <- gatherData(id) + } else{ + d <- gatherDataBabel(id) + } #print(d$dataused) message(paste(id, ": ", d$dataused, sep='')) return(d) @@ -581,7 +589,107 @@ gatherData <- function(id) return(mbdata) } # function gather.mbdata - +# Retrieve annotation data for a compound, using only babel +#' Retrieve annotation data +#' +#' Retrieves annotation data for a compound by using babel, +#' based on the SMILES code and name of the compounds stored in the +#' compound list. +#' +#' Composes the "upper part" of a MassBank record filled with chemical data +#' about the compound: name, exact mass, structure, CAS no.. +#' The instrument type is also written into this block (even +#' if not strictly part of the chemical information). Additionally, index +#' fields are added at the start of the record, which will be removed later: +#' \code{id, dbcas, dbname} from the compound list. +#' +#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are +#' inserted empty and will be filled later on. +#' +#' This function is an alternative to gatherData, in case CTS is down or if information +#' on one or more of the compounds in the compound list are sparse +#' +#' @usage gatherDataBabel(id) +#' @param id The compound ID. +#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., +#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... +#' @author Michael Stravs, Erik Mueller +#' @seealso \code{\link{mbWorkflow}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' +#' # Gather data for compound ID 131 +#' \dontrun{gatherDataBabel(131)} +#' +#' @export +gatherDataBabel <- function(id){ + #.checkMbSettings() + babeldir <- getOption("RMassBank")$babeldir + smiles <- findSmiles(id) + + + # if no babeldir was set, throw an error that says that either CTS or babel have to be used + if(is.na(babeldir)) + { + stop("No babeldir supplied; It is currently not possible to convert the information without either babel or CTS") + } else { + ###Babel conversion + cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') + inchikey <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) + cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi') + inchi <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE) + + ##Read from Compoundlist + smiles <- findSmiles(id) + mass <- findMass(smiles) + dbcas <- findCAS(id) + dbname <- findName(id) + if(is.na(dbname)) dbname <- "" + if(is.na(dbcas)) dbcas <- "" + formula <- findFormula(id) + + ##Create + mbdata <- list() + mbdata[['id']] <- id + mbdata[['dbcas']] <- dbcas + mbdata[['dbname']] <- dbname + mbdata[['dataused']] <- "smiles" + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors + mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license + mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright + # Confidence annotation and internal ID annotation. + # The ID of the compound will be written like: + # COMMENT: EAWAG_UCHEM_ID 1234 + # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" + mbdata[["COMMENT"]] <- list() + mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment + mbdata[["COMMENT"]][["ID"]] <- id + # here compound info starts + mbdata[['CH$NAME']] <- as.list(dbname) + + # Currently we use a fixed value for Compound Class, since there is no useful + # convention of what should go there and what shouldn't, and the field is not used + # in search queries. + mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class + mbdata[['CH$FORMULA']] <- formula + mbdata[['CH$EXACT_MASS']] <- mass + mbdata[['CH$SMILES']] <- smiles + mbdata[['CH$IUPAC']] <- inchi + + link <- list() + if(dbcas != "") + link[["CAS"]] <- dbcas + link[["INCHIKEY"]] <- inchikey + mbdata[['CH$LINK']] <- link + mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument + mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type + } + return(mbdata) +} # Flatten the internal tree-like representation of MassBank data to a flat table. diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index e1953debea46b7069928b5dd52c5c058c178d867..98a0ba1d77b768317758afd5970af0ec06207000 100755 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -719,30 +719,43 @@ c.msmsWSspecs <- function(w1 = NA, w2 = NA){ cpdIDsw1 <- sapply(w1@specs, function(x) x$id) cpdIDsw2 <- sapply(w2@specs, function(x) x$id) + if(length(cpdIDsw2) == 0){ + stop("w2 can't be empty.") + } + + if(length(cpdIDsw1) == 0){ + w1@specs <- w2@specs + w1@files <- w2@files + return(w1) + } + for(i in 1:length(cpdIDsw2)){ if(any(cpdIDsw2[i] == cpdIDsw1)){ - index <- which(cpdIDsw2[i] == cpdIDsw1) - w1@specs[[index]]$peaks <- c(w1@specs[[index]]$peaks,w2@specs[[i]]$peaks) - w1@specs[[index]]$childScans <- c(w1@specs[[index]]$childScans,w2@specs[[i]]$childScans) - w1@specs[[index]]$childHeaders <- rbind(w1@specs[[index]]$childHeaders, w2@specs[[i]]$childHeaders) - - ##Fake seqNums and/or acquisitionNums if the concatenation has doubled some of them - - seqNums <- w1@specs[[index]]$childHeaders[,"seqNum"] - if(length(seqNums) != length(unique(seqNums))){ - w1@specs[[index]]$childHeaders[,"seqNum"] <- 2:(nrow(w1@specs[[index]]$childHeaders) + 1) - w1@specs[[index]]$childScans <- 2:(nrow(w1@specs[[index]]$childHeaders) + 1) - } + index <- which(cpdIDsw2[i] == cpdIDsw1) - acqNums <- w1@specs[[index]]$childHeaders[,"acquisitionNum"] - if(length(acqNums) != length(unique(acqNums))){ - w1@specs[[index]]$childHeaders[,"acquisitionNum"] <- 2:(nrow(w1@specs[[index]]$childHeaders) + 1) - } + w1@specs[[index]]$peaks <- c(w1@specs[[index]]$peaks,w2@specs[[i]]$peaks) + w1@specs[[index]]$childScans <- c(w1@specs[[index]]$childScans,w2@specs[[i]]$childScans) + w1@specs[[index]]$childHeaders <- rbind(w1@specs[[index]]$childHeaders, w2@specs[[i]]$childHeaders) + + ##Fake seqNums and/or acquisitionNums if the concatenation has doubled some of them + + seqNums <- w1@specs[[index]]$childHeaders[,"seqNum"] + if(length(seqNums) != length(unique(seqNums))){ + w1@specs[[index]]$childHeaders[,"seqNum"] <- 2:(nrow(w1@specs[[index]]$childHeaders) + 1) + w1@specs[[index]]$childScans <- 2:(nrow(w1@specs[[index]]$childHeaders) + 1) + } + + + acqNums <- w1@specs[[index]]$childHeaders[,"acquisitionNum"] + if(length(acqNums) != length(unique(acqNums))){ + w1@specs[[index]]$childHeaders[,"acquisitionNum"] <- 2:(nrow(w1@specs[[index]]$childHeaders) + 1) + } } if(all(cpdIDsw2[i] != cpdIDsw1)){ w1@specs[[length(w1@specs) + 1]] <- w2@specs[[i]] + w1@files <- c(w1@files,w2@files[i]) } } return(w1) diff --git a/R/msmsRead.R b/R/msmsRead.R index 42516f55c8d4ae4ceaae542883c633ffdddf151a..e8b9ac910d9e145f921d85d0e9f12f944d17ead4 100644 --- a/R/msmsRead.R +++ b/R/msmsRead.R @@ -303,7 +303,7 @@ msmsRead.RAW <- function(w, xRAW = NULL, cpdids = NULL, mode, findPeaksArgs = NU ## 3rd Best: find pspec closest to RT from spreadsheet ##psp <- which.min( abs(getRT(anmsms) - RT) ) if((plots == TRUE) && (length(psp[[i]]) > 0)){ - plotPsSpectrum(anmsms[[i]], psp[[i]], log=TRUE, mzrange=c(0, findMz(cpdids)[[3]]), maxlabel=10) + plotPsSpectrum(anmsms[[i]], psp[[i]], log=TRUE, mzrange=c(0, findMz(cpdids[1])[[3]]), maxlabel=10) } if(length(psp[[i]]) != 0){ spectra[[i]] <- getpspectra(anmsms[[i]], psp[[i]]) @@ -315,12 +315,18 @@ msmsRead.RAW <- function(w, xRAW = NULL, cpdids = NULL, mode, findPeaksArgs = NU spectra[[i]] <- matrix(0,2,7) } } + if(length(w@specs) != 0){ w@specs <- c(w@specs,list(toRMB(spectra,cpdids[1],mode))) } else { w@specs[[1]] <- toRMB(spectra,cpdids[1],mode) } - w@files <- c(w@files,xRAW[[1]]@filepath) - names(w@specs)[length(w@specs)] <- basename(xRAW[[1]]@filepath) + + if(all(w@files != xRAW[[1]]@filepath)){ + w@files <- c(w@files,xRAW[[1]]@filepath) + } else{ + w@files <- c(w@files,paste0(xRAW[[1]]@filepath,"_2")) + } + names(w@specs)[length(w@specs)] <- basename(w@files[length(w@files)]) return(w) } \ No newline at end of file diff --git a/R/validateMassBank.R b/R/validateMassBank.R index 21a927f74f1955db1247fb2e86ea13471e4afc09..0085e1e3cc5f50f4a6ee9daab0eb6201405a2df6 100644 --- a/R/validateMassBank.R +++ b/R/validateMassBank.R @@ -9,7 +9,7 @@ #' "report.html" in the working directory. #' #' @aliases validate -#' @usage validate(path) +#' @usage validate(path, simple = TRUE) #' @param path The filepath to a single record, or a directory to search recursively #' @param simple If TRUE the function creates a simpler form of the RUnit .html report, better readable for humans. If FALSE it returns the unchanged RUnit report. #' @examples diff --git a/man/gatherDataBabel.Rd b/man/gatherDataBabel.Rd new file mode 100644 index 0000000000000000000000000000000000000000..4727d5b58ed78f4ea03cae15af63ea890dc90536 --- /dev/null +++ b/man/gatherDataBabel.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/createMassBank.R +\name{gatherDataBabel} +\alias{gatherDataBabel} +\title{Retrieve annotation data} +\usage{ +gatherDataBabel(id) +} +\arguments{ +\item{id}{The compound ID.} +} +\value{ +Returns a list of type \code{list(id= \var{compoundID}, ..., +'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... +} +\description{ +Retrieves annotation data for a compound by using babel, +based on the SMILES code and name of the compounds stored in the +compound list. +} +\details{ +Composes the "upper part" of a MassBank record filled with chemical data +about the compound: name, exact mass, structure, CAS no.. +The instrument type is also written into this block (even +if not strictly part of the chemical information). Additionally, index +fields are added at the start of the record, which will be removed later: +\code{id, dbcas, dbname} from the compound list. + +Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are +inserted empty and will be filled later on. + +This function is an alternative to gatherData, in case CTS is down or if information +on one or more of the compounds in the compound list are sparse +} +\examples{ +# Gather data for compound ID 131 +\dontrun{gatherDataBabel(131)} +} +\author{ +Michael Stravs, Erik Mueller +} +\references{ +MassBank record format: +\url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +} +\seealso{ +\code{\link{mbWorkflow}} +} + diff --git a/man/mbWorkflow.Rd b/man/mbWorkflow.Rd index 275ef1d8065e13d30a671792771f81e4acc485bd..dc7b756da95c2507e96d08568f332bd41ebf3e47 100755 --- a/man/mbWorkflow.Rd +++ b/man/mbWorkflow.Rd @@ -3,7 +3,7 @@ \title{MassBank record creation workflow} \usage{ mbWorkflow(mb, steps = c(1, 2, 3, 4, 5, 6, 7, 8), - infolist_path = "./infolist.csv") + infolist_path = "./infolist.csv", useCTS = TRUE) } \arguments{ \item{steps}{Which steps in the workflow to perform.} @@ -13,6 +13,11 @@ manually inspected.} \item{mb}{The \code{mbWorkspace} to work in.} + + \item{useCTS}{A boolean variable denoting whether + to retrieve information using CTS or to use babel to + retrieve minimal information.} + } \value{ The processed \code{mbWorkspace}. diff --git a/man/validate.Rd b/man/validate.Rd index 7bbedf117ff3c12e44a281b59a96b06708fc5ef5..3c0b19deab568e8cef91983a89cada2c45660f96 100644 --- a/man/validate.Rd +++ b/man/validate.Rd @@ -8,6 +8,7 @@ validate(path, simple=TRUE) } \arguments{ \item{path}{The filepath to a single record, or a directory to search recursively} + \item{simple}{If TRUE the function creates a simpler form of the RUnit .html report, better readable for humans. If FALSE it returns the unchanged RUnit report.} } \description{