Gitlab migration complete. If you have any issue please read the FAQ.

Verified Commit bfbb94b4 authored by Todor Kondic's avatar Todor Kondic
Browse files

Update mk_comp_tab system: make it more efficient.

* R/api.R(mk_comp_tab): Updated. Removed dependency on
calc_mz_from_smiles, instead split cpd lists into knowns and unknowns,
calculate formulas first, then the mz, then rebind knowns and
unknowns.

* R/mix.R(calc_mz_from_smiles): Removed.

(smiles2form): New function. Take list of smiles and turn them into
formulas.

(calc_mz_from_formula): Updated. Account for zero-length input.
parent 9ffaa21d
......@@ -174,21 +174,67 @@ mk_comp_tab <- function(m) {
comp <- cmpds[setid,on="ID"][mzml,.(tag,adduct,ID,RT,set,Name,file,SMILES,Formula,mz,known),on="set",allow.cartesian=T]
tab2file(tab=comp,file=paste0("setidmerge",".csv"))
setkey(comp,known,set,ID)
message("Merged all sets.")
message("Calculate formulas from SMILES (if any). Please wait.")
## Get just the info needed for mz calculation.
comp_known <- comp[known=="structure" | known=="formula"]
## Remove mz==NA col from knowns.
comp_known[,mz:=NULL]
comp_unknown <- comp[known=="mz"]
smiforadd <- comp_known[known=="structure" | known=="formula",unique(.SD),.SDcols=c("adduct","ID","SMILES","Formula")]
## Known structure.
## comp[,`:=`(mz=mapply(calc_mz_from_smiles,SMILES,adduct,ID,USE.NAMES = F))]
comp[known=="structure",`:=`(mz=calc_mz_from_smiles(SMILES,adduct,ID))]
## Known formula.
comp[known=="formula",`:=`(mz=calc_mz_from_formula(Formula,adduct,ID))]
## Turn SMILES into formulas.
smiles <- smiforadd[,unique(.SD),.SDcols=c("SMILES")]
smiles[,`:=`(Formula=smiles2form(SMILES))]
badsmiles <- as.character(smiles[Formula=="",SMILES])
if (length(badsmiles)>0) {
stop("Unable to create formula from SMILES:",paste(badsmiles,collapse="\n"))
}
smiforadd <- smiles[smiforadd,.(ID,SMILES,Formula,adduct),on=c("SMILES")]
data.table::setkey(smiforadd,"adduct","ID")
## Update the intermediate table with masses.
message("Formulas have been calculated. Start calculating masses from formulas.")
smiforadd[,mz:=calc_mz_from_formula(Formula,adduct,ID)]
message("Mass calculation has been completed.")
## Update the whole comprehensive table with masses from
## formulas. Doing it in a merge leaves a mess that has to be
## cleaned.
comp2 <- merge(comp_known,smiforadd,all.x = T, by= c("adduct","ID"))
## Take Formulas from smiforadd (y) and SMILES from comp (x).
comp2[,`:=`(Formula=Formula.y,SMILES=SMILES.x)]
## Now, populate mz from smiforadd (y) if SMILES/formula known,
## else take what was in the comp (x).
## comp2[,mz:=fifelse(known=="structure" | known=="formula",mz.y,mz.x)]
nms <- names(comp)
comp_known<-comp2[,..nms]
## In case you were wondering why is this all so complicated,
## well, for the moment I do not want to exclude mixed knowns and
## unknowns in the same run. The unknowns would have masses filled
## already at the stage of the compound list, so they are taken
## from comp_unknown. Another twist is that mz can be calculated
## from either SMILES, or Formula.
## Combine knowns and unknowns finally.
comp <- rbind(comp_known,comp_unknown)
## Rename stuff to be renamed and reorder columns.
setnames(comp,names(COMP_NAME_MAP),
function(o) COMP_NAME_MAP[[o]])
setcolorder(comp,COMP_NAME_FIRST)
## Write it out.
fn_out <- get_fn_comp(m)
tab2file(tab=comp,file=fn_out)
message("Generation of comp table finished.")
message("Generation of comprehensive table finished.")
## Index for fast search and access.
setkeyv(comp,c("set","tag","mz"))
m$out$tab$comp <- comp
## TODO: Not tested on cases when there are both knowns and
## unknowns present in the compound lists. It *should* work
## though.
m
}
......
......@@ -77,6 +77,7 @@ calc_mz_from_formula_outer <- function(chform,adduct,id) {
}
calc_mz_from_formula <- function(chform,adduct,id) {
if (length(chform) == 0 ) return(numeric(0))
check_chform <- enviPat::check_chemform(ISOTOPES,chform)
wind <- which(check_chform$warning)
if (length(wind) > 0) stop("Cannot understand the following formulas: ",
......@@ -128,19 +129,21 @@ calc_mz_from_formula <- function(chform,adduct,id) {
mz
}
calc_mz_from_smiles <- function(smiles,adduct,id) {
mol <- lapply(smiles,function(s) try(RMassBank::getMolecule(s), silent = T))
check <- which(is.atomic(mol))
if (length(check) > 0)
stop("Errors in SMILES with IDs:",paste(id[which],collapse = ','))
mol_form <- sapply(mol,function(x) (rcdk::get.mol2formula(x))@string,USE.NAMES = F)
names(mol_form) <- id
calc_mz_from_formula(mol_form,adduct,id)
## Convert SMILES strings into molecular formula strings.
##
## smiles: character vector of SMILES strings.
##
## Returns a character vector with one entry per element of `smiles`.
## An entry is "" when no molecule could be built from the SMILES, so
## callers can detect bad input by comparing against "". Zero-length
## input yields character(0).
smiles2form <- function(smiles) {
    one2form <- function(s) {
        ## try() hands back an atomic "try-error" object when
        ## getMolecule fails; a successfully built molecule is
        ## presumably a non-atomic object (TODO confirm against
        ## RMassBank).
        mol <- try(RMassBank::getMolecule(s), silent = TRUE)
        if (!is.atomic(mol)) {
            (rcdk::get.mol2formula(mol))@string
        } else {
            ""
        }
    }
    ## vapply (not sapply) pins the result type to character, so that
    ## empty input gives character(0) rather than list().
    vapply(smiles, one2form, FUN.VALUE = character(1), USE.NAMES = FALSE)
}
calc_mz_from_smiles_outer <- function(smiles,adduct,id) {
mol <- lapply(smiles,function(s) try(RMassBank::getMolecule(s), silent = T))
check <- which(is.atomic(mol))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment