.listEnvEnv <- new.env() assign("listEnv", NULL, envir=.listEnvEnv) #' Load compound list for RMassBank #' #' Loads a CSV compound list with compound IDs #' #' The list is loaded into the variable \code{\var{compoundList}} in the environment #' \code{listEnv} (which defaults to the global environment) and used by #' the \code{findMz}, \code{findCAS}, ... functions. The CSV file is required to have at least the following columns, which are used for #' further processing and must be named correctly (but present in any order): \var{ID, Name, SMILES, RT, #' CAS} #' #' resetList() clears a currently loaded list. #' #' @aliases loadList resetList #' @usage loadList(path, listEnv=NULL, check=TRUE) #' #' resetList() #' @param path Path to the CSV list. #' @param listEnv The environment to load the list into. By default, the namelist is loaded #' into an environment internally in RMassBank. #' @param check A parameter that specifies whether the SMILES-Codes in the list should be checked for readability by rcdk. #' @return No return value. #' @author Michael Stravs #' @seealso \code{\link{findMz}} #' @examples #' #' ## #' \dontrun{loadList("mylist.csv")} #' #' @export loadList <- function(path, listEnv = NULL, check = TRUE) { if(is.null(listEnv)) listEnv <- .listEnvEnv if(!file.exists(path)) stop("The supplied file does not exist, please supply a correct path") # Try out if the file is comma- or semicolon-separated compoundList <- read.csv(path, stringsAsFactors=FALSE, check.names=FALSE) n <- colnames(compoundList) if(!("ID" %in% n)){ # If no ID column, it must be semicolon separated compoundList <- read.csv2(path, stringsAsFactors=FALSE, check.names=FALSE) n <- colnames(compoundList) if(!("ID" %in% n)){ # ...or there must be something wrong with the column names stop("There is no 'ID' column in the compound list") } } # Now everything should be fine, at least regarding csv and ssv # Are there duplicate compound IDs? if(any(duplicated(compoundList$ID))){ stop("Duplicate compound IDs are present. Please check the compound list.") } # Do all Compoundlist IDs have 4 characters or less? if(any(nchar(compoundList$ID) > 4)){ stop("The maximum number of digits for compound IDs is 4") } # Evaluate if all strictly necessary columns are in the list cols <- c('ID', 'Name', 'SMILES', 'RT', 'CAS') d <- setdiff(cols, n) if(length(d)>0){ stop(paste("Some columns are missing in the compound list. It needs at least", paste(cols,collapse=", "))) } # Is there a "Level" column? if("Level" %in% n){ newList <- TRUE } else { newList <- FALSE } # Assign for now... assign("listEnv", listEnv, envir=.listEnvEnv) .listEnvEnv$listEnv$compoundList <- compoundList # If "level" is in the compound list we have to check several things: if(newList){ # a) Are the levels part of the defined levels? # b) Are the values ok for every level? (i.e. all necessary values supplied for each line in the compound list?) # Check a) (and translate to number levels because handling numbers and text would make the if-statements here even more confusing) compoundList$Level <- sapply(as.character(compoundList$Level),.translateLevel) # Need this variable for levels 3b, 3c, 3d, 4 and potentially 3 and 3a formulaColPresent <- "Formula" %in% n if(any(c("3b","3c","3d","4") %in% compoundList$Level)){ if(!(formulaColPresent)){ .listEnvEnv$listEnv$compoundList <- NULL stop('The compound list must contain the column "Formula" if a tentative or formula compound (levels 3b, 3c, 3d, 4) is present') } } # Need this variable for level 5 if("5" %in% compoundList$Level){ mzCol <- which(c("mz","m/z","mass","exactMass") %in% n) if(length(mzCol) > 1){ .listEnvEnv$listEnv$compoundList <- NULL stop("The compound list can only contain one column with one of the following column names: 'mz','m/z','mass','exactMass' if an unknown compound (level 5) is present") } if(mzCol == 2){ n[which(n == "m/z")] <- "mz" } if(mzCol == 4){ n[which(n == "exactMass")] <- "mass" } .listEnvEnv$listEnv$mzCol <- c("mz","m/z","mass","exactMass")[mzCol] colnames(compoundList) <- n } # Check b) for(i in 1:length(compoundList$Level)){ level <- compoundList$Level[i] # ID must have numbers as values if(!is.numeric(compoundList[i,"ID"])){ .listEnvEnv$listEnv$compoundList <- NULL stop(paste("Value for 'ID' in line", i, "of the compound list is not a valid compound ID")) } # If level is "1x" or "2x", the SMILES must be supplied and must be correct if(level %in% c("0","1","1a","1b","1c","2","2a","2b")){ currEnvir <- environment() tryCatch( findMz(compoundList[i,"ID"]), error = function(e){ .listEnvEnv$listEnv$compoundList <- NULL stop(paste("'SMILES' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid SMILES")) } ) } # If level is "3" or "3a", a valid smiles or formula must be supplied if(level %in% c("3","3a")){ if(!is.na(findSmiles(compoundList[i,"ID"]))){ tryCatch( findMz(compoundList[i,"ID"]), error = function(e){ .listEnvEnv$listEnv$compoundList <- NULL stop(paste("'SMILES' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid SMILES")) } ) } else{ if(!formulaColPresent){ .listEnvEnv$listEnv$compoundList <- NULL stop(paste("Compound", compoundList[i,"ID"], "in line", i, "of the compound list is marked as level 3, but there is no valid SMILES value nor a 'Formula' column present")) } tryCatch( rcdk::get.formula(findFormula(compoundList[i,"ID"], retrieval="tentative")), error = function(e){ .listEnvEnv$listEnv$compoundList <- NULL stop(paste("'Formula' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid Formula")) } ) } } # If level is "3b","3c","3d" or "4", a valid formula must be supplied if(level %in% c("3b","3c","3d","4")){ tryCatch( rcdk::get.formula(findFormula(compoundList[i,"ID"], retrieval="tentative")), error = function(e){ .listEnvEnv$listEnv$compoundList <- NULL stop(paste("'Formula' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid Formula")) } ) } # If level is "5", m/z must be supplied if(level == "5"){ if(!is.numeric(findMz(compoundList[i,"ID"], retrieval="unknown")$mzCenter)){ .listEnvEnv$listEnv$compoundList <- NULL stop(paste(c("mz","m/z","mass","exactMass")[mzCol],"value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid number")) } } } } # If "Level" is not in the compound list it MUST be a standard list, so process just as before: if(!newList){ cols <- c('ID', 'Name', 'SMILES', 'RT', 'CAS') d <- setdiff(cols, n) if(length(d)>0){ stop("Some columns are missing in the compound list. Needs at least ID, Name, SMILES, RT, CAS.") } ### ###Test if all the IDs work ### if(check){ wrongID <- vector() currEnvir <- environment() sapply(compoundList[,"ID"], function(x){ tryCatch( findMz(x), error = function(e){ currEnvir$wrongID <- c(currEnvir$wrongID, x) } ) }) if(length(wrongID)){ .listEnvEnv$listEnv$compoundList <- NULL stop(paste("Unable to interpret the SMILES-strings for ID(s)", paste(wrongID, collapse= " "), "\nPlease check these and load the list again.")) } } Level <- rep("0",nrow(compoundList)) .listEnvEnv$listEnv$compoundList <- cbind(compoundList,Level) } message("Loaded compoundlist successfully") } #' @export resetList <- function() { if(is.null(.listEnvEnv$listEnv)) return() if(!exists("compoundList", where=.listEnvEnv$listEnv)) return() rm(.listEnvEnv$listEnv$compoundList) assign("listEnv", NULL, envir=.listEnvEnv) } # Function that translates one entry of the level column of the compound list into the number system .translateLevel <- function(level){ if(!(level %in% c("0","1","1a","1b","1c","2","2a","2b","3","3a","3b","3c","3d","4","5"))){ switch(level, standard={ return("1a") }, parent={ return("1b") }, confirmed={ return("1c") }, probable={ return("2") }, probableLibrary={ return("2a") }, probableDiagnostic={ return("2b") }, tentative={ return("3") }, tentativeStructure={ return("3a") }, tentativeIsomer={ return("3b") }, tentativeTPClass={ return("3c") }, tentativeBestMatch={ return("3d") }, formula={ return("4") }, unknown={ return("5") }, exactMass={ return("5") }, { stop(paste(level, "is not a valid level")) } ) } else{ return(level) } } #' Create Rcdk molecule from SMILES #' #' Generates a Rcdk molecule object from SMILES code, which is fully typed and #' usable (in contrast to the built-in \code{parse.smiles}). #' #' \bold{NOTE: As of today (2012-03-16), Rcdk discards stereochemistry when #' loading the SMILES code!} Therefore, do not trust this function blindly, #' e.g. don't generate InChI keys from the result. It is, however, useful if #' you want to compute the mass (or something else) with Rcdk. #' #' @usage getMolecule(smiles) #' @param smiles The SMILES code of the compound. #' @return A Rcdk \code{IAtomContainer} reference. #' @author Michael Stravs #' @seealso \code{\link{parse.smiles}} #' @examples #' #' # Lindane: #' getMolecule("C1(C(C(C(C(C1Cl)Cl)Cl)Cl)Cl)Cl") #' # Benzene: #' getMolecule("C1=CC=CC=C1") #' #' @export getMolecule <- function(smiles) { capture.output(mol <- parse.smiles(smiles)[[1]]) do.aromaticity(mol) convert.implicit.to.explicit(mol) do.aromaticity(mol) do.typing(mol) do.isotopes(mol) return(mol) } #' Find the exact mass +/- a given margin for a given formula or its ions and adducts. #' #' @param formula The molecular formula in text or list format (see \code{\link{formulastring.to.list}} #' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions #' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). "" for the uncharged molecule. #' @param ppm The ppm margin to add/subtract #' @param deltaMz The absolute mass to add/subtract. Cumulative with \code{ppm} #' @return A \code{list(mzMin=, mzCenter=, mzMax=)} with the masses. #' @author Michael A. Stravs, Eawag <michael.stravs@@eawag.ch> #' @examples findMz.formula("C6H6") #' @seealso \code{\link{findMz}} #' @export findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) { if (!any(mode %in% c("pH", "pNa", "pM", "pNH4", "mH", "mFA", "mM", ""))) stop(paste("The ionization mode", mode, "is unknown.")) mzopt <- list(addition = "", charge = 0) if (mode == "pH") mzopt <- list(addition = "H", charge = 1) if (mode == "pNa") mzopt <- list(addition = "Na", charge = 1) if (mode == "pM") mzopt <- list(addition = "", charge = 1) if (mode == "pNH4") mzopt <- list(addition = "NH4", charge = -1) if (mode == "mH") mzopt <- list(addition = "H-1", charge = -1) if (mode == "mFA") mzopt <- list(addition = "C1O2", charge = -1) if (mode == "mM") mzopt <- list(addition = "", charge = -1) if (mode == "") mzopt <- list(addition = "", charge = 0) formula <- add.formula(formula, mzopt$addition) # Since in special cases we want to use this with negative and zero number of atoms, we account for this case # by splitting up the formula into positive and negative atom counts (this eliminates the zeroes.) formula.split <- split.formula.posneg(formula) m <- 0 if(formula.split$pos != "") { formula.pos <- get.formula(formula.split$pos, charge = mzopt$charge) m = m + formula.pos@mass } if(formula.split$neg != "") { formula.neg <- get.formula(formula.split$neg, charge = -mzopt$charge) m = m - formula.neg@mass } if((nchar(formula.split$pos)==0) & (nchar(formula.split$neg)==0)) { m <- get.formula("H", charge = mzopt$charge)@mass - get.formula("H", charge = 0)@mass } delta <- ppm(m, ppm, l = TRUE) return(list(mzMin = delta[[2]] - deltaMz, mzMax = delta[[1]] + deltaMz, mzCenter = m)) } #' Find compound information #' #' Retrieves compound information from the loaded compound list or calculates #' it from the SMILES code in the list. #' #' @aliases findMz findSmiles findFormula findRt findCAS findName findLevel #' @usage findMz(cpdID, mode = "pH", ppm = 10, deltaMz = 0, retrieval="standard") #' #' findRt(cpdID) #' #' findSmiles(cpdID) #' #' findFormula(cpdID, retrieval="standard") #' #' findCAS(cpdID) #' #' findName(cpdID) #' #' findLevel(cpdID, compact=FALSE) #' @param cpdID The compound ID in the compound list. #' @param mode Specifies the species of the molecule: An empty string specifies #' uncharged monoisotopic mass, \code{\var{pH}} (positive H) specifies [M+H]+, #' \code{\var{pNa}} specifies [M+Na]+, \code{\var{pM}} specifies [M]+, #' \code{\var{mH}} and \code{\var{mFA}} specify [M-H]- and [M+FA]-, #' respectively. (I apologize for the naming of \code{\var{pH}} which has #' absolutely nothing to do with chemical \emph{pH} values.) #' @param ppm Specifies ppm window (10 ppm will return the range of the #' molecular mass + and - 10 ppm). #' @param deltaMz Specifies additional m/z window to add to the range (deltaMz #' = 0.02 will return the range of the molecular mass +- 0.02 (and additionally #' +- the set ppm value). #' @param retrieval A value that determines whether the files should be handled either as "standard", #' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" #' if the only know thing is the m/z #' @param compact Only for \code{findLevel}, returns the "retrieval" parameter used for many functions #' within RMassBank if TRUE #' @return \code{findMz} will return a \code{list(mzCenter=, mzMin=, mzMax=)} #' with the molecular weight of the given ion, as calculated from the SMILES #' code and Rcdk. #' #' \code{findRt}, \code{findSmiles},\code{findCAS},\code{findName} will return #' the corresponding entry from the compound list. \code{findFormula} returns #' the molecular formula as determined from the SMILES code. #' #' @author Michael Stravs #' @seealso \code{\link{findMass}}, \code{\link{loadList}}, \code{\link{findMz.formula}} #' @examples #' #' \dontrun{% #' findMz(123, "pH", 5) #' findFormula(123) #' } #' #' @export findMz <- function(cpdID, mode="pH", ppm=10, deltaMz=0, retrieval="standard") { # In case of unknown: m/z is in table if(retrieval == "unknown"){ mz <- .listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),.listEnvEnv$listEnv$mzCol] delta <- ppm(mz, ppm, l=TRUE) return(list(mzMin=delta[2] - deltaMz, mzMax=delta[1] + deltaMz, mzCenter=mz)) } # In case of tentative: calculate mass from formula if(retrieval == "tentative"){ return(findMz.formula(findFormula(cpdID, "tentative"),mode=mode)) } # All other cases: Use smiles to calculate mass if(retrieval == "standard"){ s <- findSmiles(cpdID) #if(s=="-") s <- NA if(is.na(s)){ stop("There was no matching SMILES-entry to the supplied cpdID(s) \n Please check the cpdIDs and the compoundlist.") return(list(mzMin=NA,mzMax=NA,mzCenter=NA)) } formula <- .get.mol2formula(getMolecule(s)) return(findMz.formula(formula@string, mode, ppm, deltaMz)) } } #findMz <- function(...) findMz.rcdk(...) #' @export findRt <- function(cpdID) { if(is.null(.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") if(!exists("compoundList", where=.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") if(is.character(cpdID)) cpdID <- as.numeric(cpdID) rt <- as.numeric(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"RT"]) if(!is.null(getOption("RMassBank")$rtShift)) rt <- rt + getOption("RMassBank")$rtShift if(is.na(rt)) return(list(RT=NA)) else return(list(RT=rt)) } #' @export findSmiles <- function(cpdID) { if(is.character(cpdID)) cpdID <- as.numeric(cpdID) if(is.null(.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") if(!exists("compoundList", where=.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") if(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"] == "") return(NA) return(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"]) } #' @export findFormula <- function(cpdID, retrieval = "standard") { # In case of tentative: read formula from table if(retrieval=="tentative"){ return(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Formula"]) } # Otherwise: Convert smiles to formula if(retrieval=="standard"){ smiles <- findSmiles(cpdID) mol <- getMolecule(smiles) f <- .get.mol2formula(mol) return(f@string) } } #' @export findCAS <- function(cpdID) { if(is.character(cpdID)) cpdID <- as.numeric(cpdID) if(is.null(.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") if(!exists("compoundList", where=.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") return(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"CAS"]) } #' @export findName <- function(cpdID) { if(is.character(cpdID)) cpdID <- as.numeric(cpdID) if(is.null(.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") if(!exists("compoundList", where=.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") return(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Name"]) } #' @export findLevel <- function(cpdID, compact=FALSE) { if(is.character(cpdID)) cpdID <- as.numeric(cpdID) if(is.null(.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") if(!exists("compoundList", where=.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") if(compact){ level <- .listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Level"] if(level %in% c("0","1","1a","1b","1c","2","2a","2b","3","3a")){ return("standard") } if(level %in% c("3b","3c","3d","4")){ return("tentative") } if(level == "5"){ return("unknown") } return("tentative") } return(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Level"]) } #' Calculate exact mass #' #' Retrieves the exact mass of the uncharged molecule. It works directly from #' the SMILES and therefore is used in the MassBank workflow #' (\code{\link{mbWorkflow}}) - there, all properties are calculated from the #' SMILES code retrieved from the database. (Alternatively, takes also the #' compound ID as parameter and looks it up.) Calculation relies on Rcdk. #' #' @param cpdID_or_smiles SMILES code or compound ID of the molecule. (Numerics #' are treated as compound ID). #' @param retrieval A value that determines whether the files should be handled either as "standard", #' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" #' if the only know thing is the m/z #' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions #' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). #' Only needed for retrieval="unknown" #' @return Returns the exact mass of the uncharged molecule. #' @author Michael Stravs #' @seealso \code{\link{findMz}} #' @examples #' #' ## #' findMass("OC[C@@H]1OC(O)[C@@H](O)[C@@@@H](O)[C@@@@H]1O") #' #' @export findMass <- function(cpdID_or_smiles, retrieval="standard", mode = "pH") { # Must calculate mass manually if no formula is given if(retrieval == "unknown"){ if(mode == "pH") { mass <- 1.00784 mode.charge <- 1 } else if(mode == "pNa") { mass <- 22.989769 mode.charge <- 1 } else if(mode == "pM") { mass <- 0 mode.charge <- 1 } else if(mode == "mM") { mass <- 0 mode.charge <- -1 } else if(mode == "mH") { mass <- -1.00784 mode.charge <- -1 } else if(mode == "mFA") { mass <- 59.0440 mode.charge <- -1 } else if(mode == "pNH4") { mass <- 18.03846 mode.charge <- 1 } else{ stop("mode = \"", mode, "\" not defined") } return(findMz(cpdID_or_smiles, mode=mode, retrieval=retrieval)$mzCenter - mass + mode.charge * .emass) } # Calculate mass from formula if(retrieval == "tentative"){ return(get.formula(findFormula(cpdID_or_smiles, "tentative"))@mass) } # Calculate mass from SMILES if(retrieval == "standard"){ if(!is.numeric(cpdID_or_smiles)) s <- cpdID_or_smiles else s <- findSmiles(cpdID_or_smiles) mol <- getMolecule(s) return(get.exact.mass(mol)) } }