From 8608c194e10e74bbf64dcdf5503f2f7f33e7d669 Mon Sep 17 00:00:00 2001
From: Yohan Jarosz <yohanjarosz@yahoo.fr>
Date: Fri, 17 Jul 2015 14:16:50 +0200
Subject: [PATCH] extract initialization from core workflow

---
 IMP                 |  51 +++++++++++++++++--
 Snakefile           |  77 ++---------------------------
 config              |  63 +++++++++++++++++++++++
 init.rule           | 118 ++++++++++++++++++++++++++++++++++++++++++++
 src/config.imp.json |   3 +-
 5 files changed, 233 insertions(+), 79 deletions(-)
 create mode 100644 config
 create mode 100644 init.rule

diff --git a/IMP b/IMP
index 8076863..9b6ae92 100755
--- a/IMP
+++ b/IMP
@@ -5,6 +5,9 @@ import subprocess
 import os
 import json
 import shlex
+import snakemake
+from copy import deepcopy
+import tempfile
 
 __doc__ = """Integrated Metaomic Pipeline.
  ____  __  __  ____
@@ -14,16 +17,19 @@ __doc__ = """Integrated Metaomic Pipeline.
 
 Usage:
   IMP -m MG1 -m MG2 -t MT1 -t MT2 -o OUTPUT [--enter] [--norm] [-n CONTAINER] [-e ENV] ... [COMMANDS ...]
+  IMP --init [-d DBPATH]
   IMP (-h | --help)
   IMP --version
 
 Options:
   -e ENV            Environment variable to pass to the container
   --enter           Enter the container
+  --init            Initialize IMP databases (may take a while)
   --norm            Don't delete the container after use. Useful for debugging.
   -h --help         Show this help and exit
   -m MG             Path to the metagenomics paired files (must be 2 files).
   -t MT             Path to the metatranscriptomic paired files (2 files).
+  -d DBPATH         Path to the databases [default: db]
   -n CONTAINER      Name of the container. Useful when you want to run a previous version of IMP.
   -o OUTPUT         Path to the output directory
 """
@@ -35,6 +41,21 @@ def get_version():
     )
 
 
+def dict_merge(a, b):
+    """
+    Deep merge 2 dicts together
+    """
+    if not isinstance(b, dict):
+        return b
+    result = deepcopy(a)
+    for k, v in b.items():
+        if k in result and isinstance(result[k], dict):
+            result[k] = dict_merge(result[k], v)
+        else:
+            result[k] = deepcopy(v)
+    return result
+
+
 def yes_or_no(question):
     reply = str(input(question + ' (y/n): ')).lower().strip()
     if reply[0] == 'y':
@@ -44,11 +65,23 @@ def yes_or_no(question):
     else:
         return yes_or_no("Please enter ")
 
-if __name__ == '__main__':
-    args = docopt(__doc__, version=get_version(), options_first=True)
 
+def init(args):
     CURRENT_PATH = Path(__file__).parent.abspath()
+    # start docker container to index files and setup prokka
+    container_name = args['-n'] is not None and args['-n'] or 'imp:latest'
+    db_path = Path(args['-d']).abspath()
+    cmd = [
+        'docker', 'run', '--rm',
+        '-v %s:/home/imp/integrated-metaomic-pipeline' % CURRENT_PATH,
+        '-v %s:/databases' % db_path, container_name, 'snakemake -s %s/' % CURRENT_PATH
+    ]
+    print("Executing", '"', ' '.join(cmd), '"')
+    subprocess.call(cmd)
 
+
+def run(args):
+    CURRENT_PATH = Path(__file__).parent.abspath()
     # find common path
     mg_data = [Path(p).abspath() for p in args['-m']]
     mt_data = [Path(p).abspath() for p in args['-t']]
@@ -69,12 +102,13 @@ if __name__ == '__main__':
         else:
             exit(0)
     container_name = args['-n'] is not None and args['-n'] or 'imp:latest'
-
+    db_path = Path(args['-d']).abspath()
     # configure IMP mount point to the docker container
     mount_points = [
         '-v %s:/data' % common_path,
         '-v %s:/home/imp/integrated-metaomic-pipeline' % CURRENT_PATH,
-        '-v %s:/output' % output
+        '-v %s:/output' % output,
+        '-v %s:/databases' % db_path,
     ]
 
     # environement variables (add MG and MT data)
@@ -107,3 +141,12 @@ if __name__ == '__main__':
     cmd = shlex.split(' '.join(cmd))
     print("Executing", '"', ' '.join(cmd), '"')
     subprocess.call(cmd)
+
+
+if __name__ == '__main__':
+    args = docopt(__doc__, version=get_version(), options_first=True)
+
+    if args['--init']:
+        init(args)
+    else:
+        run(args)
diff --git a/Snakefile b/Snakefile
index 2de4da2..2575d8a 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,65 +1,5 @@
-import os
-import shutil
-import gzip
-import json
-import bz2
-from copy import deepcopy
-import subprocess
-
-
-def dict_merge(a, b):
-    """
-    Deep merge 2 dicts together
-    """
-    if not isinstance(b, dict):
-        return b
-    result = deepcopy(a)
-    for k, v in b.items():
-        if k in result and isinstance(result[k], dict):
-            result[k] = dict_merge(result[k], v)
-        else:
-            result[k] = deepcopy(v)
-    return result
-
-# default configuration file
-configfile:
-    "src/config.imp.json"
-
-# default executable for snakmake
-shell.executable("bash")
-
-# custom configuration file
-CUSTOM_CONFIG_PATH = os.environ.get("CONFIGFILE", "conf/userconfig.imp.json")
-# merge 2 configurations files together
-if os.path.exists(CUSTOM_CONFIG_PATH):
-    with open(CUSTOM_CONFIG_PATH, 'r') as rhandle:
-        data = json.load(rhandle)
-        config = dict_merge(config, data)
-
-
-# some parameters
-SRCDIR = os.environ.get("SRCDIR", config['imp_src'])
-
-KOOPA = os.environ.get("KOOPA", None)
-# get parameters from the command line
-OUTPUTDIR = os.environ.get("OUTPUTDIR", config['outputdir'])
-MG = os.environ.get("MG", config['raws']['Metagenomics']).split()
-MT = os.environ.get("MT", config['raws']['Metatranscriptomics']).split()
-SAMPLE = os.environ.get("SAMPLE", config['sample'])
-DBPATH = os.environ.get("DBPATH", config['db_path'])
-if not os.path.exists(DBPATH):
-    os.makedirs(DBPATH)
-
-# Get general parameters
-THREADS = os.environ.get("THREADS", config['threads'])
-MEMTOTAL = os.environ.get("MEMTOTAL", config['memory_total_gb'])
-MEMCORE = os.environ.get("MEMCORE", config['memory_per_core_gb'])
-
-# temporary directory will be stored inside the OUTPUTDIR directory
-# unless a absolute path is set
-TMPDIR = os.environ.get("TMPDIR", config['tmp_dir'])
-if not os.path.isabs(TMPDIR):
-    TMPDIR = os.path.join(OUTPUTDIR, TMPDIR)
+include:
+    "config"
 
 
 def prepare_environment(stepname):
@@ -83,7 +23,7 @@ def prepare_environment(stepname):
     return out, os.path.join(out, '%s.log' % stepname)
 
 
-# INCLUDES RULES
+# INCLUDES PROCESSING RULES
 include:
     "rules/Util.rules"
 include:
@@ -94,21 +34,10 @@ include:
     "rules/Analysis/master.rules"
 
 
-# locate source directory and name scripts
-src = lambda p: os.path.join(SRCDIR, p)
-
-
 rule ALL:
     input:
         preprocessing_output_files(),
         assembly_output_files(),
         analysis_output_files()
-
     shell:
         "echo 'DONE'"
-
-rule MODULE_LOAD_TEST:
-    shell:
-        """
-        IMPPRL="{config[preload][test]}"; if [[ -n $IMPPRL ]]; then $IMPPRL; fi
-        """
diff --git a/config b/config
new file mode 100644
index 0000000..f469cbd
--- /dev/null
+++ b/config
@@ -0,0 +1,63 @@
+import os
+import shutil
+import gzip
+import json
+import bz2
+from copy import deepcopy
+import subprocess
+
+
+def dict_merge(a, b):
+    """
+    Deep merge 2 dicts together
+    """
+    if not isinstance(b, dict):
+        return b
+    result = deepcopy(a)
+    for k, v in b.items():
+        if k in result and isinstance(result[k], dict):
+            result[k] = dict_merge(result[k], v)
+        else:
+            result[k] = deepcopy(v)
+    return result
+
+# default configuration file
+configfile:
+    "src/config.imp.json"
+
+# default executable for snakemake
+shell.executable("bash")
+
+# custom configuration file
+CUSTOM_CONFIG_PATH = os.environ.get("CONFIGFILE", "conf/userconfig.imp.json")
+# merge 2 configurations files together
+if os.path.exists(CUSTOM_CONFIG_PATH):
+    with open(CUSTOM_CONFIG_PATH, 'r') as rhandle:
+        data = json.load(rhandle)
+        config = dict_merge(config, data)
+
+
+# some parameters
+SRCDIR = os.environ.get("SRCDIR", config['imp_src'])
+
+# get parameters from the command line
+OUTPUTDIR = os.environ.get("OUTPUTDIR", config['outputdir'])
+MG = os.environ.get("MG", config['raws']['Metagenomics']).split()
+MT = os.environ.get("MT", config['raws']['Metatranscriptomics']).split()
+SAMPLE = os.environ.get("SAMPLE", config['sample'])
+DBPATH = os.environ.get("DBPATH", config['db_path'])
+if not os.path.exists(DBPATH):
+    os.makedirs(DBPATH)
+
+# Get general parameters
+THREADS = os.environ.get("THREADS", config['threads'])
+MEMTOTAL = os.environ.get("MEMTOTAL", config['memory_total_gb'])
+MEMCORE = os.environ.get("MEMCORE", config['memory_per_core_gb'])
+
+# temporary directory will be stored inside the OUTPUTDIR directory
+# unless an absolute path is set
+TMPDIR = os.environ.get("TMPDIR", config['tmp_dir'])
+if not os.path.isabs(TMPDIR):
+    TMPDIR = os.path.join(OUTPUTDIR, TMPDIR)
+if not os.path.exists(TMPDIR):
+    os.makedirs(TMPDIR)
diff --git a/init.rule b/init.rule
new file mode 100644
index 0000000..3ba8ac2
--- /dev/null
+++ b/init.rule
@@ -0,0 +1,118 @@
+include:
+    "config"
+
+rule ALL:
+    input:
+        expand("{path}/{filter}.{ext}", path=DBPATH + "/human", filter=config["human_filtering"]["filter"], ext=['fa']),
+        expand("{path}/{files}.fasta", files=config["sortmerna"]["files"], path=DBPATH + "/sortmerna"),
+        expand("{path}/{db}", path=DBPATH, db=config["prokka"]["databases"]),
+        "%s/adapters/adapters.done" % DBPATH,
+        expand(
+            "{path}/idx/{files}.{ext}",
+            files=config["sortmerna"]["files"],
+            path=DBPATH + "/sortmerna",
+            ext=['bursttrie_0.dat', 'kmer_0.dat', 'pos_0.dat', 'stats'])
+
+rule _DOWNLOAD_HUMAN_DB:
+    output:
+        expand("{path}/{filter}.{ext}", path=DBPATH + "/human", filter=config["human_filtering"]["filter"], ext=['fa'])
+    params:
+        filter = config["human_filtering"]["filter"], outdir = DBPATH + "/human"
+    shell:
+        """
+        TMPD=$(mktemp -d -t --tmpdir={TMPDIR} "XXXXXX")
+        wget {config[human_filtering][url]} --no-check-certificate -O $TMPD/{params.filter}.fa.gz
+        gunzip $TMPD/{params.filter}.fa.gz
+        mkdir -p {params.outdir}
+        mv $TMPD/{params.filter}.fa {params.outdir}
+        rm -rf $TMPD
+        """
+print(TMPDIR)
+
+rule _DOWNLOAD_SORTMERNA_DATABASES:
+    output:
+        expand("{path}/{files}.fasta", files=config["sortmerna"]["files"], path=DBPATH + "/sortmerna")
+    shell:
+        """
+        TMPD=$(mktemp -d -t --tmpdir={tmp} "XXXXXX")
+        wget {pkg_url} --no-check-certificate -O $TMPD/sortmerna.tgz
+        tar -xzf  $TMPD/sortmerna.tgz --strip-components=1 -C $TMPD
+        mkdir -p {path}
+        mv $TMPD/rRNA_databases/*.fasta {path}/.
+        rm -rf $TMPD
+        """.format(
+            pkg_url=config["sortmerna"]["pkg_url"],
+            path=DBPATH + "/sortmerna",
+            tmp=TMPDIR
+        )
+
+rule _DOWNLOAD_PROKKA_DATABASES:
+    output:
+        expand("{path}/{db}", path=DBPATH, db=config["prokka"]["databases"])
+    shell:
+        """
+        ### prokka by default looks for databases where the binary is located.
+        ### we softlink so the binary can live in one place and the databases in another.
+        if [[ "{DBPATH}" = /* ]]
+        then
+            PP={DBPATH};
+        else
+            PP=$PWD/{DBPATH};
+        fi
+        cd $(dirname $(which prokka))/.. && ln -s $PP db
+        echo "Softlinking $(dirname $(which prokka))/../db to $PP"
+        TMPDIR=$(mktemp -d -t "XXXXXX")
+        wget {config[prokka][pkg_url]} --no-check-certificate -O $TMPDIR/prokka.tgz
+        tar -xzf $TMPDIR/prokka.tgz --strip-components=1 -C $TMPDIR
+        mkdir -p {DBPATH}
+        cp -r $TMPDIR/db/* {DBPATH}/.
+        rm -rf $TMPDIR
+        prokka --setupdb
+        """
+
+
+rule INDEX_SORTMERNA_DB:
+    input:
+        expand("{path}/{files}.fasta", files=config["sortmerna"]["files"], path=DBPATH + "/sortmerna")
+    output:
+        expand(
+            "{path}/idx/{files}.{ext}",
+            files=config["sortmerna"]["files"],
+            path=DBPATH + "/sortmerna",
+            ext=['bursttrie_0.dat', 'kmer_0.dat', 'pos_0.dat', 'stats'])
+    run:
+        fastaindexed = expand(
+            "{path}/idx/{files}",
+            files=config["sortmerna"]["files"],
+            path=DBPATH + "/sortmerna")
+        ref = ':'.join('%s,%s' % (a, b) for a, b in zip(input, fastaindexed))
+        shell("mkdir -p {DBPATH}/sortmerna")
+        shell("indexdb_rna --ref {ref}")
+
+
+rule INDEX_FASTA_FILE:
+    input:
+        "{fasta}"
+    output:
+        "{fasta}.amb",
+        "{fasta}.bwt",
+        "{fasta}.pac",
+        "{fasta}.sa",
+        "{fasta}.ann"
+    shell:
+        """
+        bwa index {wildcards.fasta} > {log} 2>&1
+        """
+
+
+rule _DOWNLOAD_TRIMMOMATIC_ADAPTERS:
+    output:
+        "{DBPATH}/adapters/adapters.done"
+    shell:
+        """
+        wget --no-check-certificate {config[trimmomatic][pkg_url]} -O Trimmomatic-Src-0.32.zip
+        unzip Trimmomatic-Src-0.32.zip
+        cp -r trimmomatic-0.32/adapters {DBPATH}
+        rm Trimmomatic-Src-0.32.zip && rm -rf trimmomatic-0.32
+        touch {output}
+        """
diff --git a/src/config.imp.json b/src/config.imp.json
index 5a8ba78..a1a09ce 100644
--- a/src/config.imp.json
+++ b/src/config.imp.json
@@ -10,9 +10,10 @@
     },
     "sample": "test",
     "outputdir": "/output",
-    "db_path": "db",
+    "db_path": "/databases",
     "preprocessing_filtering": true,
     "trimmomatic": {
+        "pkg_url": "https://webdav-r3lab.uni.lu/public/R3lab/IMP/Trimmomatic-Src-0.32.zip",
         "adapter": "TruSeq3",
         "leading": 20,
         "minlen": 40,
-- 
GitLab