From 8608c194e10e74bbf64dcdf5503f2f7f33e7d669 Mon Sep 17 00:00:00 2001 From: Yohan Jarosz <yohanjarosz@yahoo.fr> Date: Fri, 17 Jul 2015 14:16:50 +0200 Subject: [PATCH] extract initialization from core workflow --- IMP | 51 +++++++++++++++++-- Snakefile | 77 ++--------------------------- config | 63 +++++++++++++++++++++++ init.rule | 118 ++++++++++++++++++++++++++++++++++++++++++++ src/config.imp.json | 3 +- 5 files changed, 233 insertions(+), 79 deletions(-) create mode 100644 config create mode 100644 init.rule diff --git a/IMP b/IMP index 8076863..9b6ae92 100755 --- a/IMP +++ b/IMP @@ -5,6 +5,9 @@ import subprocess import os import json import shlex +import snakemake +from copy import deepcopy +import tempfile __doc__ = """Integrated Metaomic Pipeline. ____ __ __ ____ @@ -14,16 +17,19 @@ __doc__ = """Integrated Metaomic Pipeline. Usage: IMP -m MG1 -m MG2 -t MT1 -t MT2 -o OUTPUT [--enter] [--norm] [-n CONTAINER] [-e ENV] ... [COMMANDS ...] + IMP --init [-d DBPATH] IMP (-h | --help) IMP --version Options: -e ENV Environment variable to pass to the container --enter Enter the container + --init Initialize IMP databases (Take a while) --norm Don't delete the container after use. Useful for debugging. -h --help Show this help and exit -m MG Path to the metagenomics paired files (must be 2 files). -t MT Path to the metatranscriptomic paired files (2 files). + -d DBPATH Path to the databases [default: db] -n CONTAINER Name of the container. Useful when you want to run a previous version of IMP. 
-o OUTPUT Path to the output directory """ @@ -35,6 +41,21 @@ def get_version(): ) +def dict_merge(a, b): + """ + Deep merge 2 dicts together + """ + if not isinstance(b, dict): + return b + result = deepcopy(a) + for k, v in b.items(): + if k in result and isinstance(result[k], dict): + result[k] = dict_merge(result[k], v) + else: + result[k] = deepcopy(v) + return result + + def yes_or_no(question): reply = str(input(question + ' (y/n): ')).lower().strip() if reply[0] == 'y': @@ -44,11 +65,23 @@ def yes_or_no(question): else: return yes_or_no("Please enter ") -if __name__ == '__main__': - args = docopt(__doc__, version=get_version(), options_first=True) +def init(args): CURRENT_PATH = Path(__file__).parent.abspath() + # start docker container to index files and setup prokka + container_name = args['-n'] is not None and args['-n'] or 'imp:latest' + db_path = Path(args['-d']).abspath() + cmd = [ + 'docker', 'run', '--rm', + '-v %s:/home/imp/integrated-metaomic-pipeline' % CURRENT_PATH, + '-v %s:/databases' % db_path, container_name, 'snakemake -s %s/' % CURRENT_PATH + ] + print("Executing", '"', ' '.join(cmd), '"') + subprocess.call(cmd) + +def run(args): + CURRENT_PATH = Path(__file__).parent.abspath() # find common path mg_data = [Path(p).abspath() for p in args['-m']] mt_data = [Path(p).abspath() for p in args['-t']] @@ -69,12 +102,13 @@ if __name__ == '__main__': else: exit(0) container_name = args['-n'] is not None and args['-n'] or 'imp:latest' - + db_path = Path(args['-d']).abspath() # configure IMP mount point to the docker container mount_points = [ '-v %s:/data' % common_path, '-v %s:/home/imp/integrated-metaomic-pipeline' % CURRENT_PATH, - '-v %s:/output' % output + '-v %s:/output' % output, + '-v %s:/databases' % db_path, ] # environement variables (add MG and MT data) @@ -107,3 +141,12 @@ if __name__ == '__main__': cmd = shlex.split(' '.join(cmd)) print("Executing", '"', ' '.join(cmd), '"') subprocess.call(cmd) + + +if __name__ == '__main__': + args = 
docopt(__doc__, version=get_version(), options_first=True) + + if args['--init']: + init(args) + else: + run(args) diff --git a/Snakefile b/Snakefile index 2de4da2..2575d8a 100644 --- a/Snakefile +++ b/Snakefile @@ -1,65 +1,5 @@ -import os -import shutil -import gzip -import json -import bz2 -from copy import deepcopy -import subprocess - - -def dict_merge(a, b): - """ - Deep merge 2 dicts together - """ - if not isinstance(b, dict): - return b - result = deepcopy(a) - for k, v in b.items(): - if k in result and isinstance(result[k], dict): - result[k] = dict_merge(result[k], v) - else: - result[k] = deepcopy(v) - return result - -# default configuration file -configfile: - "src/config.imp.json" - -# default executable for snakmake -shell.executable("bash") - -# custom configuration file -CUSTOM_CONFIG_PATH = os.environ.get("CONFIGFILE", "conf/userconfig.imp.json") -# merge 2 configurations files together -if os.path.exists(CUSTOM_CONFIG_PATH): - with open(CUSTOM_CONFIG_PATH, 'r') as rhandle: - data = json.load(rhandle) - config = dict_merge(config, data) - - -# some parameters -SRCDIR = os.environ.get("SRCDIR", config['imp_src']) - -KOOPA = os.environ.get("KOOPA", None) -# get parameters from the command line -OUTPUTDIR = os.environ.get("OUTPUTDIR", config['outputdir']) -MG = os.environ.get("MG", config['raws']['Metagenomics']).split() -MT = os.environ.get("MT", config['raws']['Metatranscriptomics']).split() -SAMPLE = os.environ.get("SAMPLE", config['sample']) -DBPATH = os.environ.get("DBPATH", config['db_path']) -if not os.path.exists(DBPATH): - os.makedirs(DBPATH) - -# Get general parameters -THREADS = os.environ.get("THREADS", config['threads']) -MEMTOTAL = os.environ.get("MEMTOTAL", config['memory_total_gb']) -MEMCORE = os.environ.get("MEMCORE", config['memory_per_core_gb']) - -# temporary directory will be stored inside the OUTPUTDIR directory -# unless a absolute path is set -TMPDIR = os.environ.get("TMPDIR", config['tmp_dir']) -if not os.path.isabs(TMPDIR): 
- TMPDIR = os.path.join(OUTPUTDIR, TMPDIR) +include: + "config" def prepare_environment(stepname): @@ -83,7 +23,7 @@ def prepare_environment(stepname): return out, os.path.join(out, '%s.log' % stepname) -# INCLUDES RULES +# INCLUDES PROCESSING RULES include: "rules/Util.rules" include: @@ -94,21 +34,10 @@ include: "rules/Analysis/master.rules" -# locate source directory and name scripts -src = lambda p: os.path.join(SRCDIR, p) - - rule ALL: input: preprocessing_output_files(), assembly_output_files(), analysis_output_files() - shell: "echo 'DONE'" - -rule MODULE_LOAD_TEST: - shell: - """ - IMPPRL="{config[preload][test]}"; if [[ -n $IMPPRL ]]; then $IMPPRL; fi - """ diff --git a/config b/config new file mode 100644 index 0000000..f469cbd --- /dev/null +++ b/config @@ -0,0 +1,63 @@ +import os +import shutil +import gzip +import json +import bz2 +from copy import deepcopy +import subprocess + + +def dict_merge(a, b): + """ + Deep merge 2 dicts together + """ + if not isinstance(b, dict): + return b + result = deepcopy(a) + for k, v in b.items(): + if k in result and isinstance(result[k], dict): + result[k] = dict_merge(result[k], v) + else: + result[k] = deepcopy(v) + return result + +# default configuration file +configfile: + "src/config.imp.json" + +# default executable for snakemake +shell.executable("bash") + +# custom configuration file +CUSTOM_CONFIG_PATH = os.environ.get("CONFIGFILE", "conf/userconfig.imp.json") +# merge 2 configuration files together +if os.path.exists(CUSTOM_CONFIG_PATH): + with open(CUSTOM_CONFIG_PATH, 'r') as rhandle: + data = json.load(rhandle) + config = dict_merge(config, data) + + +# some parameters +SRCDIR = os.environ.get("SRCDIR", config['imp_src']) + +# get parameters from the command line +OUTPUTDIR = os.environ.get("OUTPUTDIR", config['outputdir']) +MG = os.environ.get("MG", config['raws']['Metagenomics']).split() +MT = os.environ.get("MT", config['raws']['Metatranscriptomics']).split() +SAMPLE = os.environ.get("SAMPLE", 
config['sample']) +DBPATH = os.environ.get("DBPATH", config['db_path']) +if not os.path.exists(DBPATH): + os.makedirs(DBPATH) + +# Get general parameters +THREADS = os.environ.get("THREADS", config['threads']) +MEMTOTAL = os.environ.get("MEMTOTAL", config['memory_total_gb']) +MEMCORE = os.environ.get("MEMCORE", config['memory_per_core_gb']) + +# temporary directory will be stored inside the OUTPUTDIR directory +# unless an absolute path is set +TMPDIR = os.environ.get("TMPDIR", config['tmp_dir']) +if not os.path.isabs(TMPDIR): + TMPDIR = os.path.join(OUTPUTDIR, TMPDIR) +if not os.path.exists(TMPDIR): + os.makedirs(TMPDIR) diff --git a/init.rule b/init.rule new file mode 100644 index 0000000..3ba8ac2 --- /dev/null +++ b/init.rule @@ -0,0 +1,118 @@ +include: + "config" + +rule ALL: + input: + expand("{path}/{filter}.{ext}", path=DBPATH + "/human", filter=config["human_filtering"]["filter"], ext=['fa']), + expand("{path}/{files}.fasta", files=config["sortmerna"]["files"], path=DBPATH + "/sortmerna"), + expand("{path}/{db}", path=DBPATH, db=config["prokka"]["databases"]), + "%s/adapters/adapters.done" % DBPATH, + expand( + "{path}/idx/{files}.{ext}", + files=config["sortmerna"]["files"], + path=DBPATH + "/sortmerna", + ext=['bursttrie_0.dat', 'kmer_0.dat', 'pos_0.dat', 'stats']) + +rule _DOWNLOAD_HUMAN_DB: + output: + expand("{path}/{filter}.{ext}", path=DBPATH + "/human", filter=config["human_filtering"]["filter"], ext=['fa']) + params: + filter = config["human_filtering"]["filter"], outdir = DBPATH + "/human" + shell: + """ + TMPD=$(mktemp -d -t --tmpdir={TMPDIR} "XXXXXX") + wget {config[human_filtering][url]} --no-check-certificate -O $TMPD/{params.filter}.fa.gz + gunzip $TMPD/{params.filter}.fa.gz + mkdir -p {params.outdir} + mv $TMPD/{params.filter}.fa {params.outdir} + rm -rf $TMPD + """ +print(TMPDIR) + +rule _DOWNLOAD_SORTMERNA_DATABASES: + output: + expand("{path}/{files}.fasta", files=config["sortmerna"]["files"], path=DBPATH + "/sortmerna") + shell: + """ + 
TMPD=$(mktemp -d -t --tmpdir={tmp} "XXXXXX") + wget {pkg_url} --no-check-certificate -O $TMPD/sortmerna.tgz + tar -xzf $TMPD/sortmerna.tgz --strip-components=1 -C $TMPD + mkdir -p {path} + mv $TMPD/rRNA_databases/*.fasta {path}/. + rm -rf $TMPD + """.format( + pkg_url=config["sortmerna"]["pkg_url"], + path=DBPATH + "/sortmerna", + tmp=TMPDIR + ) + +rule _DOWNLOAD_PROKKA_DATABASES: + output: + expand("{path}/{db}", path=DBPATH, db=config["prokka"]["databases"]) + shell: + """ + ### prokka by default looks for databases where the binary is located. + ### we have to softlink to put the binary somewhere and the databases somewhere else. + if [[ "{DBPATH}" = /* ]] + then + PP={DBPATH}; + else + PP=$PWD/{DBPATH}; + fi + cd $(dirname $(which prokka))/.. && ln -s $PP db + echo "Softlinking $(dirname $(which prokka))/../db to $PP" + TMPDIR=$(mktemp -d -t "XXXXXX") + wget {config[prokka][pkg_url]} --no-check-certificate -O $TMPDIR/prokka.tgz + tar -xzf $TMPDIR/prokka.tgz --strip-components=1 -C $TMPDIR + mkdir -p {DBPATH} + cp -r $TMPDIR/db/* {DBPATH}/. 
+ rm -rf $TMPDIR + prokka --setupdb + """ + + +rule INDEX_SORTMERNA_DB: + input: + expand("{path}/{files}.fasta", files=config["sortmerna"]["files"], path=DBPATH + "/sortmerna") + output: + expand( + "{path}/idx/{files}.{ext}", + files=config["sortmerna"]["files"], + path=DBPATH + "/sortmerna", + ext=['bursttrie_0.dat', 'kmer_0.dat', 'pos_0.dat', 'stats']) + run: + fastaindexed = expand( + "{path}/idx/{files}", + files=config["sortmerna"]["files"], + path=DBPATH + "/sortmerna") + ref = ':'.join('%s,%s' % (a, b) for a, b in zip(input, fastaindexed)) + shell("mkdir -p {DBPATH}/sortmerna") + shell("indexdb_rna --ref {ref}") + + +rule INDEX_FASTA_FILE: + input: + "{fasta}" + output: + "{fasta}.amb", + "{fasta}.bwt", + "{fasta}.pac", + "{fasta}.sa", + "{fasta}.ann" + shell: + """ + bwa index {wildcards.fasta} > {log} 2>&1 + """ + + +rule _DOWNLOAD_TRIMMOMATIC_ADAPTERS: + output: + "{DBPATH}/adapters/adapters.done" + shell: + """ + wget --no-check-certificate {config[trimmomatic][pkg_url]} -O Trimmomatic-Src-0.32.zip + unzip Trimmomatic-Src-0.32.zip + cp -r trimmomatic-0.32/adapters {DBPATH} + rm Trimmomatic-Src-0.32.zip && rm -rf trimmomatic-0.32 + touch {output} + """ diff --git a/src/config.imp.json b/src/config.imp.json index 5a8ba78..a1a09ce 100644 --- a/src/config.imp.json +++ b/src/config.imp.json @@ -10,9 +10,10 @@ }, "sample": "test", "outputdir": "/output", - "db_path": "db", + "db_path": "/databases", "preprocessing_filtering": true, "trimmomatic": { + "pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/Trimmomatic-Src-0.32.zip", "adapter": "TruSeq3", "leading": 20, "minlen": 40, -- GitLab