Commit 8608c194 authored by Yohan Jarosz's avatar Yohan Jarosz

extract initialization from core workflow

parent 40ed497a
@@ -5,6 +5,9 @@ import subprocess
import os
import json
import shlex
import snakemake
from copy import deepcopy
import tempfile
__doc__ = """Integrated Metaomic Pipeline. __doc__ = """Integrated Metaomic Pipeline.
____ __ __ ____
@@ -14,16 +17,19 @@ __doc__ = """Integrated Metaomic Pipeline.
Usage:
    IMP -m MG1 -m MG2 -t MT1 -t MT2 -o OUTPUT [--enter] [--norm] [-n CONTAINER] [-e ENV] ... [COMMANDS ...]
    IMP --init [-d DBPATH]
    IMP (-h | --help)
    IMP --version
Options:
    -e ENV        Environment variable to pass to the container
    --enter       Enter the container
    --init        Initialize the IMP databases (takes a while)
    --norm        Don't delete the container after use. Useful for debugging.
    -h --help     Show this help and exit
    -m MG         Path to the metagenomics paired files (must be 2 files).
    -t MT         Path to the metatranscriptomic paired files (2 files).
    -d DBPATH     Path to the databases [default: db]
    -n CONTAINER  Name of the container. Useful when you want to run a previous version of IMP.
    -o OUTPUT     Path to the output directory
"""
@@ -35,6 +41,21 @@ def get_version():
)
def dict_merge(a, b):
    """
    Deep merge two dicts.
    """
    if not isinstance(b, dict):
        return b
    result = deepcopy(a)
    for k, v in b.items():
        if k in result and isinstance(result[k], dict):
            result[k] = dict_merge(result[k], v)
        else:
            result[k] = deepcopy(v)
    return result
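For reference, a quick sketch of dict_merge's semantics with made-up inputs: nested dicts are merged key by key, and scalar values from the second argument win.

# Illustration only (hypothetical values, not part of the commit):
a = {'tool': {'threads': 4, 'mem': '8G'}, 'sample': 'test'}
b = {'tool': {'threads': 8}, 'outputdir': '/output'}
merged = dict_merge(a, b)
# merged == {'tool': {'threads': 8, 'mem': '8G'},
#            'sample': 'test', 'outputdir': '/output'}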
def yes_or_no(question):
    reply = str(input(question + ' (y/n): ')).lower().strip()
    if reply[0] == 'y':
@@ -44,11 +65,23 @@ def yes_or_no(question):
    else:
        return yes_or_no("Please enter ")
if __name__ == '__main__':
    args = docopt(__doc__, version=get_version(), options_first=True)
def init(args):
    CURRENT_PATH = Path(__file__).parent.abspath()
    # start a docker container to index files and set up prokka
    container_name = args['-n'] is not None and args['-n'] or 'imp:latest'
    db_path = Path(args['-d']).abspath()
    cmd = [
        'docker', 'run', '--rm',
        '-v %s:/home/imp/integrated-metaomic-pipeline' % CURRENT_PATH,
        '-v %s:/databases' % db_path, container_name, 'snakemake -s %s/' % CURRENT_PATH
    ]
    # split the joined command into tokens, as run() does below, so that
    # each '-v path:path' pair is passed to docker as separate arguments
    cmd = shlex.split(' '.join(cmd))
    print("Executing", '"', ' '.join(cmd), '"')
    subprocess.call(cmd)
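For illustration, assuming a checkout at /home/user/IMP run from that directory with the default -d, init() ends up invoking roughly the following (all paths hypothetical):

# Hypothetical paths, for illustration only:
#   CURRENT_PATH == /home/user/IMP, db_path == /home/user/IMP/db
# After shlex-splitting, subprocess.call receives the tokens of:
#   docker run --rm \
#       -v /home/user/IMP:/home/imp/integrated-metaomic-pipeline \
#       -v /home/user/IMP/db:/databases \
#       imp:latest snakemake -s /home/user/IMP/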
def run(args):
    CURRENT_PATH = Path(__file__).parent.abspath()
    # find common path
    mg_data = [Path(p).abspath() for p in args['-m']]
    mt_data = [Path(p).abspath() for p in args['-t']]
@@ -69,12 +102,13 @@ if __name__ == '__main__':
    else:
        exit(0)
    container_name = args['-n'] is not None and args['-n'] or 'imp:latest'
    db_path = Path(args['-d']).abspath()
    # configure IMP mount points for the docker container
    mount_points = [
        '-v %s:/data' % common_path,
        '-v %s:/home/imp/integrated-metaomic-pipeline' % CURRENT_PATH,
        '-v %s:/output' % output,
        '-v %s:/databases' % db_path,
    ]
    # environment variables (add MG and MT data)
@@ -107,3 +141,12 @@ if __name__ == '__main__':
    cmd = shlex.split(' '.join(cmd))
    print("Executing", '"', ' '.join(cmd), '"')
    subprocess.call(cmd)
if __name__ == '__main__':
    args = docopt(__doc__, version=get_version(), options_first=True)
    if args['--init']:
        init(args)
    else:
        run(args)
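A sketch of what docopt returns for each entry point (file names hypothetical):

# Illustration only, with hypothetical arguments:
args = docopt(__doc__, argv=['--init', '-d', '/mnt/db'], options_first=True)
# args['--init'] == True, args['-d'] == '/mnt/db'          -> init(args)
args = docopt(__doc__, options_first=True,
              argv=['-m', 'r1.fq', '-m', 'r2.fq',
                    '-t', 't1.fq', '-t', 't2.fq', '-o', 'out'])
# args['--init'] == False, args['-m'] == ['r1.fq', 'r2.fq'] -> run(args)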
include:
    "config"
def prepare_environment(stepname):
@@ -83,7 +23,7 @@ def prepare_environment(stepname):
    return out, os.path.join(out, '%s.log' % stepname)
# INCLUDES PROCESSING RULES
include:
    "rules/Util.rules"
include:
@@ -94,21 +34,10 @@ include:
"rules/Analysis/master.rules" "rules/Analysis/master.rules"
# locate source directory and name scripts
src = lambda p: os.path.join(SRCDIR, p)
rule ALL:
    input:
        preprocessing_output_files(),
        assembly_output_files(),
        analysis_output_files()
    shell:
        "echo 'DONE'"
rule MODULE_LOAD_TEST:
    shell:
        """
        IMPPRL="{config[preload][test]}"; if [[ -n $IMPPRL ]]; then $IMPPRL; fi
        """
config 0 → 100644
import os
import shutil
import gzip
import json
import bz2
from copy import deepcopy
import subprocess
def dict_merge(a, b):
    """
    Deep merge two dicts.
    """
    if not isinstance(b, dict):
        return b
    result = deepcopy(a)
    for k, v in b.items():
        if k in result and isinstance(result[k], dict):
            result[k] = dict_merge(result[k], v)
        else:
            result[k] = deepcopy(v)
    return result
# default configuration file
configfile:
    "src/config.imp.json"
# default executable for snakemake
shell.executable("bash")
# custom configuration file
CUSTOM_CONFIG_PATH = os.environ.get("CONFIGFILE", "conf/userconfig.imp.json")
# merge the two configuration files
if os.path.exists(CUSTOM_CONFIG_PATH):
    with open(CUSTOM_CONFIG_PATH, 'r') as rhandle:
        data = json.load(rhandle)
        config = dict_merge(config, data)
# some parameters
SRCDIR = os.environ.get("SRCDIR", config['imp_src'])
# get parameters from the environment (set by the IMP wrapper from the command line)
OUTPUTDIR = os.environ.get("OUTPUTDIR", config['outputdir'])
MG = os.environ.get("MG", config['raws']['Metagenomics']).split()
MT = os.environ.get("MT", config['raws']['Metatranscriptomics']).split()
SAMPLE = os.environ.get("SAMPLE", config['sample'])
DBPATH = os.environ.get("DBPATH", config['db_path'])
if not os.path.exists(DBPATH):
    os.makedirs(DBPATH)
# Get general parameters
THREADS = os.environ.get("THREADS", config['threads'])
MEMTOTAL = os.environ.get("MEMTOTAL", config['memory_total_gb'])
MEMCORE = os.environ.get("MEMCORE", config['memory_per_core_gb'])
# the temporary directory is created inside the OUTPUTDIR directory
# unless an absolute path is given
TMPDIR = os.environ.get("TMPDIR", config['tmp_dir'])
if not os.path.isabs(TMPDIR):
    TMPDIR = os.path.join(OUTPUTDIR, TMPDIR)
if not os.path.exists(TMPDIR):
    os.makedirs(TMPDIR)
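A worked example of the TMPDIR resolution above, with made-up values: a relative tmp_dir lands inside OUTPUTDIR, while an absolute one is used as-is.

# Illustration only (hypothetical values):
import os
OUTPUTDIR = '/output'
for tmp_dir in ('tmp', '/scratch/tmp'):
    if not os.path.isabs(tmp_dir):
        tmp_dir = os.path.join(OUTPUTDIR, tmp_dir)
    print(tmp_dir)  # -> /output/tmp, then /scratch/tmp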
init.rule 0 → 100644
include:
"config"
rule ALL:
    input:
        expand("{path}/{filter}.{ext}", path=DBPATH + "/human", filter=config["human_filtering"]["filter"], ext=['fa']),
        expand("{path}/{files}.fasta", files=config["sortmerna"]["files"], path=DBPATH + "/sortmerna"),
        expand("{path}/{db}", path=DBPATH, db=config["prokka"]["databases"]),
        "%s/adapters/adapters.done" % DBPATH,
        expand(
            "{path}/idx/{files}.{ext}",
            files=config["sortmerna"]["files"],
            path=DBPATH + "/sortmerna",
            ext=['bursttrie_0.dat', 'kmer_0.dat', 'pos_0.dat', 'stats'])
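For reference, each expand() call above enumerates the cross product of its keyword lists; a plain-Python sketch of the last one, with stand-in sortmerna file names:

# Pure-Python equivalent of the last expand() call (hypothetical file names):
from itertools import product
files = ['silva-bac-16s', 'silva-arc-16s']  # stand-ins for config["sortmerna"]["files"]
exts = ['bursttrie_0.dat', 'kmer_0.dat', 'pos_0.dat', 'stats']
targets = ['/databases/sortmerna/idx/%s.%s' % (f, e) for f, e in product(files, exts)]
# -> ['/databases/sortmerna/idx/silva-bac-16s.bursttrie_0.dat', ...]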
rule _DOWNLOAD_HUMAN_DB:
    output:
        expand("{path}/{filter}.{ext}", path=DBPATH + "/human", filter=config["human_filtering"]["filter"], ext=['fa'])
    params:
        filter = config["human_filtering"]["filter"], outdir = DBPATH + "/human"
    shell:
        """
        TMPD=$(mktemp -d -t --tmpdir={TMPDIR} "XXXXXX")
        wget {config[human_filtering][url]} --no-check-certificate -O $TMPD/{params.filter}.fa.gz
        gunzip $TMPD/{params.filter}.fa.gz
        mkdir -p {params.outdir}
        mv $TMPD/{params.filter}.fa {params.outdir}
        rm -rf $TMPD
        """
print(TMPDIR)
rule _DOWNLOAD_SORTMERNA_DATABASES:
    output:
        expand("{path}/{files}.fasta", files=config["sortmerna"]["files"], path=DBPATH + "/sortmerna")
    shell:
        """
        TMPD=$(mktemp -d -t --tmpdir={tmp} "XXXXXX")
        wget {pkg_url} --no-check-certificate -O $TMPD/sortmerna.tgz
        tar -xzf $TMPD/sortmerna.tgz --strip-components=1 -C $TMPD
        mkdir -p {path}
        mv $TMPD/rRNA_databases/*.fasta {path}/.
        rm -rf $TMPD
        """.format(
            pkg_url=config["sortmerna"]["pkg_url"],
            path=DBPATH + "/sortmerna",
            tmp=TMPDIR
        )
rule _DOWNLOAD_PROKKA_DATABASES:
    output:
        expand("{path}/{db}", path=DBPATH, db=config["prokka"]["databases"])
    shell:
        """
        ### By default prokka looks for its databases next to the binary,
        ### so we symlink the database directory into place and keep the data elsewhere.
        if [[ "{DBPATH}" = /* ]]
        then
            PP={DBPATH};
        else
            PP=$PWD/{DBPATH};
        fi
        cd $(dirname $(which prokka))/.. && ln -s $PP db
        echo "Softlinking $(dirname $(which prokka))/../db to $PP"
        TMPDIR=$(mktemp -d -t "XXXXXX")
        wget {config[prokka][pkg_url]} --no-check-certificate -O $TMPDIR/prokka.tgz
        tar -xzf $TMPDIR/prokka.tgz --strip-components=1 -C $TMPDIR
        mkdir -p {DBPATH}
        cp -r $TMPDIR/db/* {DBPATH}/.
        rm -rf $TMPDIR
        prokka --setupdb
        """
rule INDEX_SORTMERNA_DB:
    input:
        expand("{path}/{files}.fasta", files=config["sortmerna"]["files"], path=DBPATH + "/sortmerna")
    output:
        expand(
            "{path}/idx/{files}.{ext}",
            files=config["sortmerna"]["files"],
            path=DBPATH + "/sortmerna",
            ext=['bursttrie_0.dat', 'kmer_0.dat', 'pos_0.dat', 'stats'])
    run:
        fastaindexed = expand(
            "{path}/idx/{files}",
            files=config["sortmerna"]["files"],
            path=DBPATH + "/sortmerna")
        ref = ':'.join('%s,%s' % (a, b) for a, b in zip(input, fastaindexed))
        shell("mkdir -p {DBPATH}/sortmerna")
        shell("indexdb_rna --ref {ref}")
rule INDEX_FASTA_FILE:
    input:
        "{fasta}"
    output:
        "{fasta}.amb",
        "{fasta}.bwt",
        "{fasta}.pac",
        "{fasta}.sa",
        "{fasta}.ann"
    log:
        # assumed log path: the shell command below references {log},
        # which the original rule never defined
        "{fasta}.bwa_index.log"
    shell:
        """
        bwa index {wildcards.fasta} > {log} 2>&1
        """
rule _DOWNLOAD_TRIMMOMATIC_ADAPTERS:
    output:
        "{DBPATH}/adapters/adapters.done"
    shell:
        """
        wget --no-check-certificate {config[trimmomatic][pkg_url]} -O Trimmomatic-Src-0.32.zip
        unzip Trimmomatic-Src-0.32.zip
        cp -r trimmomatic-0.32/adapters {DBPATH}
        rm Trimmomatic-Src-0.32.zip && rm -rf trimmomatic-0.32
        touch {output}
        """
@@ -10,9 +10,10 @@
},
"sample": "test",
"outputdir": "/output",
"db_path": "/databases",
"preprocessing_filtering": true,
"trimmomatic": {
    "pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/Trimmomatic-Src-0.32.zip",
    "adapter": "TruSeq3",
    "leading": 20,
    "minlen": 40,
...