Skip to content
Snippets Groups Projects
Commit c2eedb81 authored by Yohan Jarosz's avatar Yohan Jarosz
Browse files

start refactoring

parent c540d360
No related branches found
No related tags found
1 merge request!1Single omics
{
"human_filtering": {
"filter": "hg21",
"url": "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/chromosomes/chr21.fa.gz"
},
"preprocessing_filtering": true,
"db_path": "/Users/yohan.jarosz/SandBox/databases",
"outputdir": "/Users/yohan.jarosz/SandBox/imp_output"
}
......@@ -6,7 +6,6 @@ import bz2
from copy import deepcopy
import subprocess
def dict_merge(a, b):
"""
Deep merge 2 dicts together
......@@ -21,15 +20,17 @@ def dict_merge(a, b):
result[k] = deepcopy(v)
return result
# default configuration file
configfile:
"src/config.imp.json"
"../../src/config.imp.json"
# default executable for snakmake
shell.executable("bash")
# custom configuration file
CUSTOM_CONFIG_PATH = os.environ.get("CONFIGFILE", "conf/userconfig.imp.json")
CUSTOM_CONFIG_PATH = os.environ.get("CONFIGFILE", "../../conf/userconf.imp.json")
# merge 2 configurations files together
if os.path.exists(CUSTOM_CONFIG_PATH):
with open(CUSTOM_CONFIG_PATH, 'r') as rhandle:
......
import os, shutil
def prepare_input_files(input, type):
"""
Prepare file names from input.
Will copy and/or ungzip paired input files into output directory named as '{type}.R1.fq' and '{type}.R2.fq'.
If user provides R1, R2 and SE file, we rname the file as there were already preprocessed.
input: input names from the user
type: MG or MT
"""
if len(input) < 2:
raise OSError("//[IMP] %s files must be at least two paired files or two paired file and single end." % type)
if len(input) >= 2:
r1 = input[0]
r2 = input[1]
_, fname1 = os.path.split(r1)
_, fname2 = os.path.split(r2)
out1 = "{type}.r1.fq".format(type=type)
out2 = "{type}.r2.fq".format(type=type)
_process_file(fname1, r1, out1)
_process_file(fname2, r2, out2)
if len(input) == 3:
se = input[2]
_, fname3 = os.path.split(se)
out3 = "{type}.se.fq".format(type=type)
_process_file(fname3, se,out3)
# symlink to the preprocessing output directory
link1 = '{type}.r1.preprocessed.fq'.format(type=type)
link2 = '{type}.r2.preprocessed.fq'.format(type=type)
link3 = '{type}.se.preprocessed.fq'.format(type=type)
print('symlink', out1, '=>', link1)
os.symlink(out1, link1)
print('symlink', out2, '=>', link2)
os.symlink(out2, link2)
print('symlink', out3, '=>', link3)
os.symlink(out3, link3)
def _process_file(fname, inp, outfilename):
"""
Write the input to the output. Handle raw, zip, or bzip input files.
"""
print(inp, '=>', outfilename)
import bz2
# ungunzip
if os.path.splitext(fname)[-1] in ['.gz', '.gzip']:
with open(outfilename, 'wb') as whandle, gzip.open(inp, 'rb') as rhandle:
whandle.write(rhandle.read())
# unbzip2
elif os.path.splitext(fname)[-1] in ['.bz2', '.bzip2']:
dec = bz2.BZ2Decompressor()
with open(outfilename, 'wb') as whandle, open(inp, 'rb') as rhandle:
for data in iter(lambda: rhandle.read(CHUNK_SIZE), b''):
whandle.write(dec.decompress(data))
# copy
else:
shutil.copy(inp, outfilename)
def datainput(wildcards):
for wildcard in wildcards:
if str(wildcard) in TYPES:
return data_input[wildcard]
return None
rule PREPARE_DATA_INPUT:
input:
datainput
output:
"{type}.r1.fq",
"{type}.r2.fq"
params:
type='{wildcards.type}'
run:
prepare_input_files(input, wildcards.type)
workdir:
DBPATH
rule trimmomatic_adapters:
output:
touch("{DBPATH}/adapters/adapters.done")
shell:
"""
wget --no-check-certificate {config[trimmomatic][pkg_url]} -O Trimmomatic-Src-0.32.zip
unzip Trimmomatic-Src-0.32.zip
cp -r trimmomatic-0.32/adapters {DBPATH}
rm Trimmomatic-Src-0.32.zip && rm -rf trimmomatic-0.32
touch {output}
"""
rule trimming:
input:
'{type}.r1.fq',
'{type}.r2.fq',
DBPATH + "/adapters/adapters.done"
output:
'{type}.r1.trimmed.fq',
'{type}.se1.trimmed.fq',
'{type}.r2.trimmed.fq',
'{type}.se2.trimmed.fq'
shell:
"""
java -jar {config[trimmomatic][jarfile]} PE -threads {THREADS} {input[0]} {input[1]} {output} \
ILLUMINACLIP:{DBPATH}/adapters/{config[trimmomatic][adapter]}-PE.fa:{config[trimmomatic][seed_mismatch]}:{config[trimmomatic][palindrome_clip_threshold]}:{config[trimmomatic][simple_clip_threshold]} \
LEADING:{config[trimmomatic][leading]} \
TRAILING:{config[trimmomatic][trailing]} \
SLIDINGWINDOW:{config[trimmomatic][window_size]}:{config[trimmomatic][window_quality]} \
MINLEN:{config[trimmomatic][minlen]} \
MAXINFO:{config[trimmomatic][target_length]}:{config[trimmomatic][strictness]}
"""
rule cat_se_trimmed:
input:
'{type}.se1.trimmed.fq',
'{type}.se2.trimmed.fq'
output:
'{type}.se.trimmed.fq',
shell:
"cat {input[0]} {input[1]} > {output}"
include:
"../../rules/config"
import os
TYPES = ['mg', 'mt']
dip="/Users/yohan.jarosz/Programmation/lcsb/IMP/TEST_DATA/"
data_input={
'mg': [dip + 'MG.R1.5_percent.fq', dip + 'MG.R2.5_percent.fq'],
'mt': [dip + 'MT.R1.5_percent.fq', dip + 'MT.R2.5_percent.fq']
}
mt = [
dip + 'MT.R1.5_percent.fq',
dip + 'MT.R2.5_percent.fq'
]
mg = [
dip + 'MG.R1.5_percent.fq',
dip + 'MG.R2.5_percent.fq'
]
include:
"../../rules/databases.rules"
workdir:
os.path.join(OUTPUTDIR, 'Preprocessing')
include:
"../../rules/data_input.rules"
include:
"../../rules/trimming.rules"
# master command
rule PREPROCESSING:
input:
expand(['{type}.r1.trimmed.fq','{type}.r2.trimmed.fq', '{type}.se.trimmed.fq'], type=TYPES)
output:
touch('preprocessing.done')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment