Skip to content
Snippets Groups Projects
Commit 17bb9496 authored by Susheel Busi's avatar Susheel Busi
Browse files

added rules for diamond analyses

parent 802dba2d
No related branches found
No related tags found
2 merge requests!78Add zymo workflow,!77Workflow: Zymo - Comparison to Reference Genomes
# Main requirements
work_dir: "/scratch/users/sbusi/zymo" work_dir: "/scratch/users/sbusi/zymo"
results_dir: "results" results_dir: "results"
data_dir: "/scratch/users/sbusi/zymo/Zymo/results"
# Assembly
# List of assemblers for different read types: assembler names MUST be UNIQUE
assemblers:
sr: ["megahit", "metaspades"]
lr: ["flye", "canu"]
hy: ["metaspadeshybrid", "operamsmegahit", "operamsmetaspades"]
# Additional params
single_fast5_dir: "data_single_fast5" single_fast5_dir: "data_single_fast5"
multi_fast5_dir: "data_multi_fast5" multi_fast5_dir: "data_multi_fast5"
ref_genomes_dir: "data_ref_genomes" ref_genomes_dir: "data_ref_genomes"
# Annotation tools
prodigal:
threads: 12
diamond: diamond:
threads: 20 threads: 20
\ No newline at end of file
...@@ -36,7 +36,12 @@ ONTP_CLUSTER="{cluster.call} -t {cluster.runtime} -n {cluster.threads} -p {clust ...@@ -36,7 +36,12 @@ ONTP_CLUSTER="{cluster.call} -t {cluster.runtime} -n {cluster.threads} -p {clust
# activate the env # activate the env
conda activate ${ONTP_ENV} conda activate ${ONTP_ENV}
# run the pipeline # unlock the pipeline
snakemake -s ${ONTP_SMK} -rp --jobs 10 --local-cores 1 \ snakemake -s ${ONTP_SMK} -rp --jobs 10 --local-cores 1 \
--configfile ${ONTP_CONFIG} --use-conda --conda-prefix ${CONDA_PREFIX}/pipeline \
--cluster-config ${ONTP_SLURM} --cluster "${ONTP_CLUSTER}" --unlock
# run the pipeline
snakemake -s ${ONTP_SMK} -rpn --jobs 10 --local-cores 1 \
--configfile ${ONTP_CONFIG} --use-conda --conda-prefix ${CONDA_PREFIX}/pipeline \ --configfile ${ONTP_CONFIG} --use-conda --conda-prefix ${CONDA_PREFIX}/pipeline \
--cluster-config ${ONTP_SLURM} --cluster "${ONTP_CLUSTER}" --cluster-config ${ONTP_SLURM} --cluster "${ONTP_CLUSTER}"
...@@ -17,12 +17,17 @@ concatenate: ...@@ -17,12 +17,17 @@ concatenate:
quality: "normal" quality: "normal"
runtime: "0-00:05:00" runtime: "0-00:05:00"
annotation_prodigal: zymo_prodigal:
partition: "batch" partition: "batch"
qos: "normal" qos: "normal"
runtime: "00-2:00:00" runtime: "00-1:00:00"
annotation_diamond: diamond_db:
partition: "batch"
quality: "normal"
runtime: "00-00:30:00"
analysis_diamond:
partition: "batch" partition: "batch"
quality: "normal" quality: "normal"
runtime: "00-2:00:00" runtime: "00-2:00:00"
\ No newline at end of file
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# Python modules # Python modules
################################################## ##################################################
import os import os
import re
############################## ##############################
# CONFIG # CONFIG
...@@ -29,6 +30,26 @@ SINGLE_FAST5_DIR = os.path.abspath(config["single_fast5_dir"]) ...@@ -29,6 +30,26 @@ SINGLE_FAST5_DIR = os.path.abspath(config["single_fast5_dir"])
MULTI_FAST5_DIR = os.path.abspath(config["multi_fast5_dir"]) MULTI_FAST5_DIR = os.path.abspath(config["multi_fast5_dir"])
REF_GENOMES_DIR = os.path.abspath(config["ref_genomes_dir"]) REF_GENOMES_DIR = os.path.abspath(config["ref_genomes_dir"])
RESULTS_DIR = config["results_dir"] RESULTS_DIR = config["results_dir"]
DATA_DIR = config["data_dir"]
##################################################
# FUNCTIONS
##################################################
def assembler_pairs(assemblers):
"""
Create all possible combinations of two assemblers
"""
from itertools import combinations
return list(combinations(assemblers, 2))
##################################################
# ADDITIONAL PARAMETERS
##################################################
READ_TYPES = list(config["assemblers"].keys()) # list of read types
ASSEMBLERS = [y for x in config["assemblers"].values() for y in x] # list of all assemblers
READ_ASSEMBLERS = [y for x in [[(k, vv) for vv in v] for k, v in config["assemblers"].items()] for y in x] # list of (read type, assembler)
READ_ASSEMBLER_PAIRS = assembler_pairs(READ_ASSEMBLERS)
################################################## ##################################################
# RULES # RULES
################################################## ##################################################
...@@ -36,22 +57,20 @@ RESULTS_DIR = config["results_dir"] ...@@ -36,22 +57,20 @@ RESULTS_DIR = config["results_dir"]
rule all: rule all:
input: input:
REF_GENOMES_DIR, REF_GENOMES_DIR,
expand(os.path.join(RESULTS_DIR,"reference/mock_genomes.fasta")) expand(os.path.join(RESULTS_DIR,"reference/mock_genomes.fasta")),
expand(os.path.join(RESULTS_DIR,"annotation/zymo_prodigal/proteins.faa")),
expand(os.path.join(RESULTS_DIR,"annotation/diamond/zymo.dmnd")),
expand(os.path.join(RESULTS_DIR, "analysis/diamond/{rtype_tool}.{ext}"),
rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS],
ext=["daa", "tsv"])
rule download_genomes: rule download_genomes:
# input:
# SINGLE_FAST5_DIR
output: output:
directory(REF_GENOMES_DIR) directory(REF_GENOMES_DIR)
log: log:
out="logs/ref_genomes.out.log", out="logs/ref_genomes.out.log",
err="logs/ref_genomes.err.log" err="logs/ref_genomes.err.log"
# threads:
# config["ont_fast5_api"]["single_to_multi_fast5"]["threads"]
# params:
# batch_size=config["ont_fast5_api"]["single_to_multi_fast5"]["batch_size"]
# conda:
# os.path.join(ENV_DIR, "ont_fast5_api.yaml")
shell: shell:
"(date && " "(date && "
"wget https://github.com/al-mcintyre/mCaller_analysis_scripts/raw/master/assemblies/bsubtilis_pb.fasta -P {output} && " "wget https://github.com/al-mcintyre/mCaller_analysis_scripts/raw/master/assemblies/bsubtilis_pb.fasta -P {output} && "
...@@ -79,4 +98,65 @@ rule concatenate: ...@@ -79,4 +98,65 @@ rule concatenate:
shell: shell:
"(date && " "(date && "
"cat {input}/*.fasta > {output} && " "cat {input}/*.fasta > {output} && "
"date) 2> {log.err} > {log.out}" "date) 2> {log.err} > {log.out}"
\ No newline at end of file
rule zymo_prodigal:
input:
rules.concatenate.output
output:
os.path.join(RESULTS_DIR,"annotation/zymo_prodigal/proteins.faa")
log:
out="logs/zymo_prodigal.out.log",
err="logs/zymo_prodigal.err.log"
threads:
config["prodigal"]["threads"]
conda:
os.path.join(ENV_DIR, "prodigal.yaml")
shell:
"(date && "
"prodigal -a {output} -p meta -i {input} && "
"date) 2> {log.err} > {log.out}"
rule diamond_db:
input:
rules.zymo_prodigal.output
output:
os.path.join(RESULTS_DIR,"annotation/diamond/zymo.dmnd")
log:
out="logs/diamond_db.out.log",
err="logs/diamond_db.err.log"
threads:
config["diamond"]["threads"]
conda:
os.path.join(ENV_DIR, "diamond.yaml")
shell:
"(date && "
"diamond makedb --in {input} -d $(echo {output} | sed 's/.dmnd$//') && "
"date) 2> {log.err} > {log.out}"
rule analysis_diamond:
input:
faa=os.path.join(DATA_DIR, "annotation/prodigal/{rtype}/{tool}/proteins.faa"),
db=rules.diamond_db.output
output:
daa=os.path.join(RESULTS_DIR, "analysis/diamond/{rtype}_{tool}.daa"),
tsv=os.path.join(RESULTS_DIR, "analysis/diamond/{rtype}_{tool}.tsv")
log:
"logs/analysis.diamond.{rtype}.{tool}.log",
wildcard_constraints:
rtype="|".join(READ_TYPES),
tool="|".join(ASSEMBLERS)
threads:
config["diamond"]["threads"]
params:
outfmt="6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
conda:
os.path.join(ENV_DIR, "diamond.yaml")
message:
"Analysis: DIAMOND search: {input}"
shell:
"(date && "
"daa={output.daa} && "
"diamond blastp -q {input.faa} --db {input.db} --out {output.daa} -p {threads} --outfmt 100 && "
"diamond view --daa ${{daa%.*}} --max-target-seqs 1 -p {threads} --outfmt {params.outfmt} --out {output.tsv} && "
"date) &> {log}"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment