-
Susheel Busi authoredSusheel Busi authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
snakefile 13.88 KiB
"""
Author: Susheel Bhanu BUSI
Affiliation: ESB group LCSB UniLU
Date: [2019-09-13]
Run: snakemake -s binning_snakefile
Latest modification:
"""
configfile:"config.yaml"
DATA_DIR=config['data_dir']
RESULTS_DIR=config['results_dir']
SAMPLES=config['samples']
# DATABASE=config['database']
###########
rule all:
# input: expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/metaspades.bwt"))
# input: expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/{reads}/metaspades.bam"), reads=["sr"])
# input: expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/{reads}/metaspades.bam"), reads=["sr", "lr"])
# input: expand(os.path.join(DATA_DIR, "{sample}/dastool_output/{sample}/"), sample=SAMPLES),
# input: expand(os.path.join(DATA_DIR, "checkm_output/{sample}/lineage.ms"), sample=SAMPLES)
######
# WRAPPER RULES
######
# Added the `shell:` so as to avoid it from being autodetected as `localrule`. This is needed so that an email can be sent upon event changes for this rule.
rule MAPPING:
input:
expand(os.path.join(DATA_DIR, "metaspades.bwt")),
expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/sr/metaspades.bam")),
expand(os.path.join(DATA_DIR, "metaspades/bwa-mem/lr/metaspades.bam")),
expand(os.path.join(DATA_DIR, "metaspades/mmi/lr/lr_hybrid.mmi")),
expand(os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades.bam")),
expand(os.path.join(DATA_DIR, "metaspades/mmi/sr/sr_hybrid.mmi")),
expand(os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades.bam")),
expand(os.path.join(DATA_DIR, "metaspades/{mapper}/merged_bam/metaspades.bam"), mapper=["bwa-mem", "mmi"])
output: "bwa_and_mmi_mapping.done"
shell:
"""
touch {output}
"""
rule BINNING:
input:
expand(os.path.join(DATA_DIR, "{sample}/dastool_output/{sample}"), sample=SAMPLES),
expand(os.path.join(DATA_DIR, "checkm_output/{sample}/lineage.ms"), sample=SAMPLES)
# expand(os.path.join(DATA_DIR, "gtdbtk_output/{sample}/gtdbtk.bac120.summary.tsv"), sample=SAMPLES)
#######################
## Mapping to hybrid ##
#######################
rule build_bwa_index:
input: "metaspades.fna"
output:
# done=touch("bwa_index.done")
index_amb=os.path.join(DATA_DIR, "{prefix}.amb"),
index_ann=os.path.join(DATA_DIR, "{prefix}.ann"),
index_bwt=os.path.join(DATA_DIR, "{prefix}.bwt"),
index_pac=os.path.join(DATA_DIR, "{prefix}.pac"),
index_sa=os.path.join(DATA_DIR, "{prefix}.sa")
log: os.path.join(DATA_DIR, "metaspades/{prefix}.bwa_index")
conda: "envs/bwa.yaml"
shell: """
(date &&\
bwa index {input} -p {wildcards.prefix} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule map_short_reads_to_hybrid_contigs_w_bwa_mem:
input:
query_r1="NEB2_MG_S17_R1_001.fastp.fq.gz",
query_r2="NEB2_MG_S17_R2_001.fastp.fq.gz",
index_amb=os.path.join(DATA_DIR, "metaspades.amb"),
index_ann=os.path.join(DATA_DIR, "metaspades.ann"),
index_bwt=os.path.join(DATA_DIR, "metaspades.bwt"),
index_pac=os.path.join(DATA_DIR, "metaspades.pac"),
index_sa=os.path.join(DATA_DIR, "metaspades.sa"),
index_fa=os.path.join(DATA_DIR, "metaspades.fna")
output:
os.path.join(DATA_DIR, "metaspades/bwa-mem/sr/metaspades.bam")
params:
index_prefix=os.path.join(DATA_DIR, "metaspades"),
out_prefix=os.path.join(DATA_DIR, "metaspades/bwa-mem/sr/metaspades")
log: os.path.join(DATA_DIR, "metaspades/sr_contigs.bwa_mem_samtools")
conda: "envs/bwa.yaml"
threads:
config["bwa"]["threads"]
shell: """
(date &&\
bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule map_long_reads_to_hybrid_contigs_w_bwa_mem:
input:
query_lr="barcode07.fastq.gz",
index_amb=os.path.join(DATA_DIR, "metaspades.amb"),
index_ann=os.path.join(DATA_DIR, "metaspades.ann"),
index_bwt=os.path.join(DATA_DIR, "metaspades.bwt"),
index_pac=os.path.join(DATA_DIR, "metaspades.pac"),
index_sa=os.path.join(DATA_DIR, "metaspades.sa"),
index_fa=os.path.join(DATA_DIR, "metaspades.fna")
output:
os.path.join(DATA_DIR, "metaspades/bwa-mem/lr/metaspades.bam")
params:
index_prefix=os.path.join(DATA_DIR, "metaspades"),
out_prefix=os.path.join(DATA_DIR, "metaspades/bwa-mem/lr/metaspades")
log: os.path.join(DATA_DIR, "metaspades/lr_contigs.bwa_mem_samtools")
conda: "envs/bwa.yaml"
threads: config["bwa"]["threads"]
shell: """
(date &&\
bwa mem -x ont2d -t {threads} {params.index_prefix} {input.query_lr} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule build_minimap2_long_read_contigs_ont_index:
input: "metaspades.fna"
output: os.path.join(DATA_DIR, "metaspades/mmi/lr/lr_hybrid.mmi")
conda: "envs/minimap2.yaml"
shell:
"""
date
minimap2 -x map-ont -d {output} {input}
date
"""
rule map_long_reads_to_long_read_contigs:
input:
query="barcode07.fastq.gz",
index=os.path.join(DATA_DIR, "metaspades/mmi/lr/lr_hybrid.mmi"),
index_fa="metaspades.fna"
output: os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades.bam")
params:
prefix=os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades")
conda: "envs/minimap2.yaml"
threads: config["minimap2"]["threads"]
log: os.path.join(DATA_DIR, "metaspades/mmi/lr/metaspades.minimap2_samtools")
shell: """
(date &&\
minimap2 -a -t {threads} {input.index} {input.query} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule build_minimap2_long_read_contigs_sr_index:
input: "metaspades.fna"
output: os.path.join(DATA_DIR, "metaspades/mmi/sr/sr_hybrid.mmi")
conda: "envs/minimap2.yaml"
shell:
"""
date
minimap2 -x sr -d {output} {input}
date
"""
rule map_short_reads_to_long_read_contigs_w_minimap2:
input:
query_r1="NEB2_MG_S17_R1_001.fastp.fq.gz",
query_r2="NEB2_MG_S17_R2_001.fastp.fq.gz",
index=os.path.join(DATA_DIR, "metaspades/mmi/sr/sr_hybrid.mmi"),
index_fa="metaspades.fna"
output: os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades.bam")
params:
prefix=os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades")
conda: "envs/minimap2.yaml"
threads: config["minimap2"]["threads"]
log: os.path.join(DATA_DIR, "metaspades/mmi/sr/metaspades.minimap2_samtools")
shell:
"""
(date &&\
minimap2 -a -t {threads} {input.index} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output}
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule merge_bam_files_for_metaspades:
input:
bam_sr=os.path.join(DATA_DIR, "metaspades/{mapper}/sr/metaspades.bam"),
bam_lr=os.path.join(DATA_DIR, "metaspades/{mapper}/lr/metaspades.bam")
output: os.path.join(DATA_DIR, "metaspades/{mapper}/merged_bam/metaspades.bam")
conda: "/home/users/sbusi/apps/environments/binning.yml"
shell:
"""
(date &&\
samtools merge {output} {input.bam_sr} {input.bam_lr} &&\
date)
"""
#####################################
########### MAXBIN2 #################
#####################################
rule maxbin2:
input:
bbref="{datadir}/{sample}.fna",
bbam="{datadir}/{sample}/{sample}.bam"
threads:config["threads"]
conda:"/home/users/sbusi/apps/environments/binning.yml"
output:
map="{datadir}/{sample}/{sample}.sam",
coverage="{datadir}/{sample}/{sample}_cov.txt",
abund="{datadir}/{sample}/{sample}_abundance.txt",
file="{datadir}/{sample}/maxbin_output/maxbin_output.001.fasta"
# mx=directory("{datadir}/{sample}/maxbin_output/")
shell:
"""
date
samtools view -h -o {output.map} {input.bbam}
pileup.sh in={output.map} out={output.coverage}
awk '{{print $1"\\t"$2}}' {output.coverage} | grep -v '^#' > {output.abund}
run_MaxBin.pl -thread {threads} -contig {input.bbref} -out {wildcards.datadir}/{wildcards.sample}/maxbin_output/maxbin_output -abund {output.abund}
touch maxbin.done
date
"""
#####################################
########### METABAT2 ################
#####################################
rule depth_files:
input:
dep1="{datadir}/{sample}/{sample}.bam"
threads:config["threads"]
conda:"/home/users/sbusi/apps/environments/binning.yml"
output:
depout="{datadir}/{sample}/{sample}_depth.txt"
shell:
"""
date
jgi_summarize_bam_contig_depths --outputDepth {output.depout} {input.dep1}
touch depth_file.done
date
"""
rule metabat2:
input:
met1="{datadir}/{sample}.fna",
met2="{datadir}/{sample}/{sample}_depth.txt"
threads:config["threads"]
conda:"/home/users/sbusi/apps/environments/binning.yml"
output:
outdir=directory("{datadir}/{sample}/metabat_output/")
shell:
"""
date
metabat2 -i {input.met1} -a {input.met2} -o {wildcards.datadir}/{wildcards.sample}/metabat_output/{wildcards.sample}.metabat -t {threads} -m 1500 -v --unbinned
touch metabat2.done
date
"""
#####################################
########### DAS_Tool ################
#####################################
rule scaffold_list:
input:
sca1="{datadir}/{sample}/metabat_output/",
# sca2="{datadir}/{sample}/maxbin_output/"
file = "{datadir}/{sample}/maxbin_output/maxbin_output.001.fasta"
threads:config["threads"]
conda:"/home/users/sbusi/apps/environments/base.yml"
output:
scout1="{datadir}/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv",
scout2="{datadir}/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv"
shell:
"""
date
Fasta_to_Scaffolds2Bin.sh -i {input.sca1} -e fa > {output.scout1}
Fasta_to_Scaffolds2Bin.sh -i {wildcards.datadir}/{wildcards.sample}/maxbin_output -e fasta > {output.scout2}
date
touch scaffold_list.done
"""
rule DAS_Tool:
input:
da1="{datadir}/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv",
da2="{datadir}/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv",
da3="{datadir}/{sample}.fna",
db=config["dastool_database"]
threads:config["threads"]
conda:"/home/users/sbusi/apps/environments/base.yml"
output:
daout=directory("{datadir}/{sample}/dastool_output/{sample}")
shell:
"""
date
DAS_Tool -i {input.da1},{input.da2} -c {input.da3} -o {output.daout} --search_engine diamond -l maxbin2,metabat2 --write_bins 1 --write_bin_evals 1 --threads {threads} --db_directory {input.db} --create_plots 1 &&\
2> >(tee {log}.stderr) > >(tee {log}.stdout)
touch das_tool.done
date
"""
#checkpoint collect_bins:
# input:
# col1="{datadir}/{sample}/dastool_output"
# col1="{datadir}/{sample}/dastool_output/{file_i}.fa"
# output:
# colout="{datadir}/{sample}/dastool_output/high_quality_bins/{file_i}.fa"
# shell: """
# date
# cp {input.col1} {output.colout}
# date
# """
#####################################
######## GTDBTK & CHECKM ############
#####################################
rule gtdbtk:
input:
gen="{datadir}/{sample}/dastool_output/{sample}/"
threads:config["threads"]
conda:"gtdbtk.yml"
output:
gtdb=directory("{datadir}/gtdbtk_output/{sample}")
shell:"""
date
gtdbtk classify_wf --cpus {threads} -x fa --genome_dir {input.gen} --out_dir {output.gtdb}
touch gtdbtk.done
date
"""
rule checkm:
input:
chkm="{datadir}/{sample}/dastool_output/{sample}/"
threads:config["threads"]
conda:"checkm.yml"
output:
chkmdir=directory("{datadir}/checkm_output/{sample}/lineage.ms")
shell:
"""
date
checkm lineage_wf -r -t {threads} -x fa {input.chkm} {output.chkmdir}
touch checkm.done
date
"""
rule plot_checkm:
input:
pl1="{datadir}/checkm_output/{sample}",
pl2="{datadir}/{sample}/dastool_output/high_quality_bins"
threads:config["threads"]
conda:"/home/users/sbusi/apps/environments/checkm.yml"
output:
plt=directory("{datadir}/checkm_output/{sample}/plots")
shell:
"""
date
checkm bin_qa_plot --image_type png --width 8 --dpi 200 --font_size 8 -x fa -q {input.pl1} {input.pl2} {output.plt}
touch plot_checkm.done
date
"""