start refactoring

c2eedb81 · Yohan Jarosz · c540d360 · c2eedb81 · c2eedb81 · c2eedb81
Commit c2eedb81 authored 9 years ago by Yohan Jarosz
--- a/conf/userconf.imp.json
+++ b/conf/userconf.imp.json
+{
+    "human_filtering": {
+        "filter": "hg21",
+        "url": "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/chromosomes/chr21.fa.gz"
+    },
+    "preprocessing_filtering": true,
+    "db_path": "/Users/yohan.jarosz/SandBox/databases",
+    "outputdir": "/Users/yohan.jarosz/SandBox/imp_output"
+}
--- a/rules/config
+++ b/rules/config
@@ -6,7 +6,6 @@ import bz2
 from copy import deepcopy
 import subprocess

-
 def dict_merge(a, b):
    """
    Deep merge 2 dicts together
@@ -21,15 +20,17 @@ def dict_merge(a, b):
            result[k] = deepcopy(v)
    return result

+
 # default configuration file
 configfile:
-    "src/config.imp.json"
+    "../../src/config.imp.json"

 # default executable for snakmake
 shell.executable("bash")

 # custom configuration file
-CUSTOM_CONFIG_PATH = os.environ.get("CONFIGFILE", "conf/userconfig.imp.json")
+CUSTOM_CONFIG_PATH = os.environ.get("CONFIGFILE", "../../conf/userconf.imp.json")
+
 # merge 2 configurations files together
 if os.path.exists(CUSTOM_CONFIG_PATH):
    with open(CUSTOM_CONFIG_PATH, 'r') as rhandle:

--- a/rules/data_input.rules
+++ b/rules/data_input.rules
+import os, shutil
+
+def prepare_input_files(input, type):
+    """
+    Prepare file names from input.
+    Will copy and/or ungzip paired input files into output directory named as '{type}.R1.fq' and '{type}.R2.fq'.
+    If user provides R1, R2 and SE file, we rname the file as there were already preprocessed.
+    input: input names from the user
+    type: MG or MT
+    """
+    if len(input) < 2:
+        raise OSError("//[IMP] %s files must be at least two paired files or two paired file and single end." % type)
+    if len(input) >= 2:
+        r1 = input[0]
+        r2 = input[1]
+        _, fname1 = os.path.split(r1)
+        _, fname2 = os.path.split(r2)
+        out1 = "{type}.r1.fq".format(type=type)
+        out2 = "{type}.r2.fq".format(type=type)
+        _process_file(fname1, r1, out1)
+        _process_file(fname2, r2, out2)
+
+    if len(input) == 3:
+        se = input[2]
+        _, fname3 = os.path.split(se)
+        out3 = "{type}.se.fq".format(type=type)
+        _process_file(fname3, se,out3)
+        # symlink to the preprocessing output directory
+        link1 = '{type}.r1.preprocessed.fq'.format(type=type)
+        link2 = '{type}.r2.preprocessed.fq'.format(type=type)
+        link3 = '{type}.se.preprocessed.fq'.format(type=type)
+        print('symlink',  out1, '=>', link1)
+        os.symlink(out1, link1)
+        print('symlink',  out2, '=>', link2)
+        os.symlink(out2, link2)
+        print('symlink',  out3, '=>', link3)
+        os.symlink(out3, link3)
+
+def _process_file(fname, inp, outfilename):
+    """
+    Write the input to the output. Handle raw, zip, or bzip input files.
+    """
+    print(inp, '=>', outfilename)
+    import bz2
+    # ungunzip
+    if os.path.splitext(fname)[-1] in ['.gz', '.gzip']:
+        with open(outfilename, 'wb') as whandle, gzip.open(inp, 'rb') as rhandle:
+            whandle.write(rhandle.read())
+    # unbzip2
+    elif os.path.splitext(fname)[-1] in ['.bz2', '.bzip2']:
+        dec = bz2.BZ2Decompressor()
+        with open(outfilename, 'wb') as whandle, open(inp, 'rb') as rhandle:
+            for data in iter(lambda: rhandle.read(CHUNK_SIZE), b''):
+                whandle.write(dec.decompress(data))
+    # copy
+    else:
+
+        shutil.copy(inp, outfilename)
+
+def datainput(wildcards):
+    for wildcard in wildcards:
+        if str(wildcard) in TYPES:
+            return data_input[wildcard]
+    return None
+
+rule PREPARE_DATA_INPUT:
+    input:
+        datainput
+    output:
+        "{type}.r1.fq",
+        "{type}.r2.fq"
+    params:
+        type='{wildcards.type}'
+    run:
+        prepare_input_files(input, wildcards.type)
--- a/rules/databases.rules
+++ b/rules/databases.rules
+workdir:
+    DBPATH
+
+rule trimmomatic_adapters:
+    output:
+        touch("{DBPATH}/adapters/adapters.done")
+    shell:
+        """
+        wget --no-check-certificate {config[trimmomatic][pkg_url]} -O Trimmomatic-Src-0.32.zip
+        unzip Trimmomatic-Src-0.32.zip
+        cp -r trimmomatic-0.32/adapters {DBPATH}
+        rm Trimmomatic-Src-0.32.zip && rm -rf trimmomatic-0.32
+        touch {output}
+        """
--- a/rules/trimming.rules
+++ b/rules/trimming.rules
+rule trimming:
+    input:
+        '{type}.r1.fq',
+        '{type}.r2.fq',
+        DBPATH + "/adapters/adapters.done"
+    output:
+        '{type}.r1.trimmed.fq',
+        '{type}.se1.trimmed.fq',
+        '{type}.r2.trimmed.fq',
+        '{type}.se2.trimmed.fq'
+    shell:
+        """
+        java -jar {config[trimmomatic][jarfile]} PE -threads {THREADS} {input[0]} {input[1]} {output} \
+        ILLUMINACLIP:{DBPATH}/adapters/{config[trimmomatic][adapter]}-PE.fa:{config[trimmomatic][seed_mismatch]}:{config[trimmomatic][palindrome_clip_threshold]}:{config[trimmomatic][simple_clip_threshold]} \
+        LEADING:{config[trimmomatic][leading]} \
+        TRAILING:{config[trimmomatic][trailing]} \
+        SLIDINGWINDOW:{config[trimmomatic][window_size]}:{config[trimmomatic][window_quality]} \
+        MINLEN:{config[trimmomatic][minlen]} \
+        MAXINFO:{config[trimmomatic][target_length]}:{config[trimmomatic][strictness]}
+        """
+
+
+rule cat_se_trimmed:
+    input:
+        '{type}.se1.trimmed.fq',
+        '{type}.se2.trimmed.fq'
+    output:
+        '{type}.se.trimmed.fq',
+    shell:
+        "cat {input[0]} {input[1]} > {output}"
--- a/workflows/integrative/Snakefile
+++ b/workflows/integrative/Snakefile
+include:
+    "../../rules/config"
+
+import os
+
+
+TYPES = ['mg', 'mt']
+
+dip="/Users/yohan.jarosz/Programmation/lcsb/IMP/TEST_DATA/"
+data_input={
+        'mg': [dip + 'MG.R1.5_percent.fq', dip + 'MG.R2.5_percent.fq'],
+        'mt': [dip + 'MT.R1.5_percent.fq', dip + 'MT.R2.5_percent.fq']
+}
+
+mt = [
+    dip + 'MT.R1.5_percent.fq',
+    dip + 'MT.R2.5_percent.fq'
+]
+
+mg = [
+    dip + 'MG.R1.5_percent.fq',
+    dip + 'MG.R2.5_percent.fq'
+]
+
+
+
+include:
+    "../../rules/databases.rules"
+
+workdir:
+    os.path.join(OUTPUTDIR, 'Preprocessing')
+
+include:
+    "../../rules/data_input.rules"
+include:
+    "../../rules/trimming.rules"
+
+# master command
+rule PREPROCESSING:
+    input:
+        expand(['{type}.r1.trimmed.fq','{type}.r2.trimmed.fq', '{type}.se.trimmed.fq'], type=TYPES)
+    output:
+        touch('preprocessing.done')