diff --git a/DESCRIPTION b/DESCRIPTION index 5f46eca1efab0e58a02c93982a074acc7f381d49..a5a71702ae251131dc3c3ea4f8fa577e63fde978 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 1.99.7 +Version: 1.99.8 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), @@ -29,7 +29,7 @@ Depends: Encoding: UTF-8 Imports: XML,RCurl,rjson, - rcdk,yaml,mzR,methods,Biobase,MSnbase,S4Vectors + rcdk,yaml,mzR,methods,Biobase,MSnbase,S4Vectors,digest Suggests: gplots,RMassBankData, xcms (>= 1.37.1), @@ -41,6 +41,7 @@ Collate: 'alternateAnalyze.R' 'createMassBank.R' 'formulaCalculator.R' + 'getSplash.R' 'leCsvAccess.R' 'leMsMs.r' 'leMsmsRaw.R' diff --git a/NAMESPACE b/NAMESPACE index e2b792db594fda8fe65ad0f7d5483ef8faf69471..fd347b58f54d556c9461bb4c00052206a0b6320d 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -134,3 +134,4 @@ import(mzR) import(rcdk) import(rjson) import(yaml) +import(digest) diff --git a/R/createMassBank.R b/R/createMassBank.R index a3e1c1f7ce282adc35535a8581a25baa85eed07d..0ef2c67fdd617e9679d7d3f8af7a80c68d28423b 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -576,6 +576,7 @@ gatherData <- function(id) mbdata[["COMMENT"]] <- list() mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment mbdata[["COMMENT"]][["ID"]] = id + # here compound info starts mbdata[['CH$NAME']] <- names # Currently we use a fixed value for Compound Class, since there is no useful @@ -758,6 +759,7 @@ gatherDataBabel <- function(id){ mbdata[["COMMENT"]] <- list() mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment mbdata[["COMMENT"]][["ID"]] <- id + # here compound info starts mbdata[['CH$NAME']] <- as.list(dbname) @@ -1247,6 +1249,13 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP # Annotation: if(getOption("RMassBank")$add_annotation==TRUE) mbdata[["PK$ANNOTATION"]] <- annotation + + ## The SPLASH is a hash value calculated across all peaks + ## http://splash.fiehnlab.ucdavis.edu/ + ## Has to be temporarily added as "PK$SPLASH" in the "lower" part + ## of the record, but will later be moved "up" when merging parts in compileRecord() + mbdata[["PK$SPLASH"]] = getSplash(peaks[,c("m/z", "int.")]) + # Peak table mbdata[["PK$NUM_PEAK"]] <- peaknum mbdata[["PK$PEAK"]] <- peaks @@ -1334,6 +1343,11 @@ compileRecord <- function(spec, mbdata, aggregated, additionalPeaks = NULL) # Note that the accession number and record title (in the upper part) must of course # be filled in with scan-specific info. mbrecord <- c(mbdata, l) + + ## Move the SPLASH from the lower part to the upper part + mbrecord[["COMMENT"]][["SPLASH"]] <- mbrecord[["PK$SPLASH"]] + mbrecord[["PK$SPLASH"]] <- NULL + # Here is the right place to fix the name of the INTERNAL ID field. names(mbrecord[["COMMENT"]])[[which(names(mbrecord[["COMMENT"]]) == "ID")]] <- getOption("RMassBank")$annotations$internal_id_fieldname diff --git a/R/getSplash.R b/R/getSplash.R new file mode 100644 index 0000000000000000000000000000000000000000..9a6e9677d1820ebe85d1afa540e6b6769e19b0b1 --- /dev/null +++ b/R/getSplash.R @@ -0,0 +1,66 @@ + +## Some constants +BINS <- 10; +BIN_SIZE <- 100; +EPS_CORRECTION = 1.0e-7 + +## FINAL_SCALE_FACTOR <- 9; ## Base 10 +FINAL_SCALE_FACTOR <- 35; ## Base 36 + +decimalavoidance <- function(x) { + format(floor (x * 1000000), scientific=FALSE) +} + +getBlockHash <- function(peaks, + RELATIVE_INTENSITY_SCALE=100, + MAX_HASH_CHARATERS_ENCODED_SPECTRUM=20) { + max_intensity = max(peaks[,2]) + + ## Scale to maximum intensity + peaks[,2] <- as.integer(peaks[,2] / max(peaks[,2]) * RELATIVE_INTENSITY_SCALE + EPS_CORRECTION) + + ## Sorted by ascending m/z and ties broken by descending intensity + o <- order(peaks[,1], -1*peaks[,2], decreasing=FALSE) + + peakString <- paste(apply(peaks[o,,drop=FALSE], MARGIN=1, + FUN=function(x) {paste(c(decimalavoidance(x[1]+EPS_CORRECTION), x[2]), + collapse=":")}), collapse=" ") + + ## cat("Prehash: ", peakString, sep="", file="/tmp/prehash-R") ## Debugging of spectrum-string + block2 <- substr(digest(peakString, algo="sha256", serialize=FALSE), + 1, MAX_HASH_CHARATERS_ENCODED_SPECTRUM) +} + +integer2base36 <- function(i) { + integer2base36code <- sapply(c(48:57, 97:122), function(i) rawToChar(as.raw(i))) + paste(integer2base36code[i+1], collapse="") +} + +getBlockHist <- function(peaks) { + ## Initialise output + wrappedhist <- integer(BINS) + + binindex <- as.integer(peaks[,1] / BIN_SIZE) + + summedintensities <- tapply(peaks[,2], binindex, sum) + wrappedbinindex <- unique(binindex) %% BINS + wrappedintensities <- tapply(summedintensities, wrappedbinindex, sum) + normalisedintensities <- as.integer(wrappedintensities/max(wrappedintensities)*FINAL_SCALE_FACTOR) + + wrappedhist[sort(unique(wrappedbinindex))+1] <- normalisedintensities + paste(integer2base36(wrappedhist), collapse="") + +} + +getSplash <- function(peaks) { + + block2 <- getBlockHist(peaks) + block3 <- getBlockHash(peaks) + + splash <- paste("splash10", + block2, + block3, + sep="-") + + return(splash) +}