Skip to content
Snippets Groups Projects
Commit c86339aa authored by Susheel Busi's avatar Susheel Busi
Browse files

updated checkpoint rules

parent dc87ff4f
No related branches found
No related tags found
1 merge request!67WIP: Checkpoint snakefile
This diff is collapsed.
# Rules for BINNING workflow, i.e. generating MAGs from assemblies
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mm"]
#SAMPLES=config["samples"]
SAMPLES=["flye", "megahit", "metaspades_hybrid"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]
###################
# Preparing files #
###################
rule prepare_assembly_files:
input:
ass1=os.path.join(RESULTS_DIR, "assembly/flye/lr/merged/no_barcode/assembly.fasta"),
ass2=os.path.join(RESULTS_DIR, "assembly/megahit/ONT3_MG_xx_Rashi_S11/final.contigs.fa"),
ass3=os.path.join(RESULTS_DIR, "assembly/metaspades_hybrid/lr_no_barcode-sr_ONT3_MG_xx_Rashi_S11/contigs.fasta"),
bam1=os.path.join(RESULTS_DIR, "mapping/sr/bwa_mem/ONT3_MG_xx_Rashi_S11_reads-x-ONT3_MG_xx_Rashi_S11-megahit_contigs.bam"),
bam2=os.path.join(RESULTS_DIR, "mapping/lr/merged/no_barcode/no_barcode_reads-x-no_barcode-flye_contigs.bam")
output:
fa1=os.path.join(RESULTS_DIR, "assembly/flye.fa"),
fa2=os.path.join(RESULTS_DIR, "assembly/megahit.fa"),
fa3=os.path.join(RESULTS_DIR, "assembly/metaspades_hybrid.fa"),
# fa4=expand(os.path.join(RESULTS_DIR, "assembly/{mapper}_{reads}_{hybrid_assembler}.fa"), mapper=["bwa", "mmi"], reads=["sr", "lr", "merged"], hybrid_assembler=HYBRID_ASSEMBLER),
fa4=os.path.join(RESULTS_DIR, "assembly/bwa_sr_metaspades_hybrid.fa"),
fa5=os.path.join(RESULTS_DIR, "assembly/bwa_lr_metaspades_hybrid.fa"),
fa6=os.path.join(RESULTS_DIR, "assembly/bwa_merged_metaspades_hybrid.fa"),
fa7=os.path.join(RESULTS_DIR, "assembly/mmi_sr_metaspades_hybrid.fa"),
fa8=os.path.join(RESULTS_DIR, "assembly/mmi_lr_metaspades_hybrid.fa"),
fa9=os.path.join(RESULTS_DIR, "assembly/mmi_merged_metaspades_hybrid.fa"),
bout1=os.path.join(RESULTS_DIR, "mapping/megahit/megahit.bam"),
bout2=os.path.join(RESULTS_DIR, "mapping/flye/flye.bam")
shell:
"""
(date &&\
ln {input.ass1} {output.fa1}
ln {input.ass2} {output.fa2}
ln {input.ass3} {output.fa3}
ln {input.ass3} {output.fa4}
ln {input.ass3} {output.fa5}
ln {input.ass3} {output.fa6}
ln {input.ass3} {output.fa7}
ln {input.ass3} {output.fa8}
ln {input.ass3} {output.fa9}
ln {input.bam1} {output.bout1}
ln {input.bam2} {output.bout2}
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
#####################################
########### MAXBIN2 #################
#####################################
rule maxbin2:
input:
bbref=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"),
bbam=os.path.join(RESULTS_DIR, "mapping/{sample}/{sample}.bam")
threads:config["threads"]
conda:"../envs/binning.yaml"
output:
map=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}.sam"),
coverage=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_cov.txt"),
abund=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_abundance.txt"),
file=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/maxbin_output.001.fasta")
# mx=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/"))
shell:
"""
(date &&\
samtools view -h -o {output.map} {input.bbam} &&\
pileup.sh in={output.map} out={output.coverage} &&\
awk '{{print $1"\\t"$2}}' {output.coverage} | grep -v '^#' > {output.abund} &&\
run_MaxBin.pl -thread {threads} -contig {input.bbref} -out results/Binning/{wildcards.sample}/maxbin_output/maxbin_output -abund {output.abund} &&\
touch maxbin.done &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
#####################################
########### METABAT2 ################
#####################################
rule depth_files:
input:
dep1=os.path.join(RESULTS_DIR, "mapping/{sample}/{sample}.bam")
threads:config["threads"]
conda:"../envs/binning.yaml"
output:
depout=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_depth.txt")
shell:
"""
(date &&\
jgi_summarize_bam_contig_depths --outputDepth {output.depout} {input.dep1} &&\
touch depth_file.done &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule metabat2:
input:
met1=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"),
met2=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_depth.txt")
threads:config["threads"]
conda:"../envs/binning.yaml"
output:
outdir=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/metabat_output/"))
shell:
"""
(date &&\
metabat2 -i {input.met1} -a {input.met2} -o results/Binning/{wildcards.sample}/metabat_output/{wildcards.sample}.metabat -t {threads} -m 1500 -v --unbinned &&\
touch metabat2.done &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
#####################################
########### DAS_Tool ################
#####################################
rule scaffold_list:
input:
sca1=os.path.join(RESULTS_DIR, "Binning/{sample}/metabat_output/"),
# sca2=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/")
file=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/maxbin_output.001.fasta")
threads:config["threads"]
# conda:"/home/users/sbusi/apps/environments/base.yml"
conda: "../envs/dastool.yaml"
output:
scout1=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv"),
scout2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv")
shell:
"""
(date &&\
export Rscript={config[Rscript]} &&\
Fasta_to_Scaffolds2Bin.sh -i {input.sca1} -e fa > {output.scout1} &&\
Fasta_to_Scaffolds2Bin.sh -i results/Binning/{wildcards.sample}/maxbin_output -e fasta > {output.scout2} &&\
touch scaffold_list.done &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule DAS_Tool:
input:
da1=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv"),
da2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv"),
da3=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"),
db=config["dastool_database"]
threads:config["threads"]
# conda:"/home/users/sbusi/apps/environments/base.yml"
conda: "../envs/dastool.yaml"
params:
basename=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}")
output:
daout=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")),
dafile=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_proteins.faa"),
damfile=touch(os.path.join(RESULTS_DIR, "Binning/{sample}_das_tool.done"))
shell:
"""
(date &&\
export Rscript={config[Rscript]} &&\
DAS_Tool -i {input.da1},{input.da2} -c {input.da3} -o {params.basename} --search_engine diamond -l maxbin2,metabat2 --write_bins 1 --write_bin_evals 1 --threads {threads} --db_directory {input.db} --create_plots 1 &&\
touch {output.damfile} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# Rule for running the mapping "workflow" for ONT analsyes
#shell.executable("/bin/bash")
#shell.prefix("source ~/.bashrc; ")
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]
#SAMPLES=config["samples"]
SAMPLES=["flye", "megahit", "metaspades_hybrid"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]
#######################
## Mapping to hybrid ##
#######################
rule build_hybrid_bwa_index:
input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fna"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode=BARCODES, hybrid_assembler=HYBRID_ASSEMBLER)
output:
# done=touch("bwa_index.done")
index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.amb"),
index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.ann"),
index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwt"),
index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.pac"),
index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.sa")
# wildcard_constraints:
# hybrid_assembler='^\/'
params:
prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}")
log: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwa_index")
conda: "../envs/bwa.yaml"
shell: """
(date &&\
bwa index {input} -p {params.prefix} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule hybrid_map_short_reads_to_hybrid_contigs_w_bwa_mem:
input:
query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"),
query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"),
index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"),
index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"),
index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"),
index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"),
index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"),
index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fna"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode=BARCODES, hybrid_assembler=HYBRID_ASSEMBLER)
output:
os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}.bam")
params:
index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"),
out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}")
log: os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/sr_contigs.bwa_mem_samtools")
conda: "../envs/bwa.yaml"
threads:
config["bwa"]["threads"]
# wildcard_constraints:
# sample='\w+'
shell: """
(date &&\
bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule hybrid_map_long_reads_to_hybrid_contigs_w_bwa_mem:
input:
query_lr=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES),
index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"),
index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"),
index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"),
index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"),
index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"),
index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fna"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode=BARCODES, hybrid_assembler=HYBRID_ASSEMBLER)
output:
os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}.bam")
params:
index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"),
out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}"),
log: os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/lr_contigs.bwa_mem_samtools")
conda: "../envs/bwa.yaml"
threads: config["bwa"]["threads"]
# wildcard_constraints:
# sample='\w+'
shell: """
(date &&\
bwa mem -x ont2d -t {threads} {params.index_prefix} {input.query_lr} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule hybrid_build_minimap2_long_read_contigs_ont_index:
input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fna"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode=BARCODES, hybrid_assembler=HYBRID_ASSEMBLER)
output: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi")
conda: "../envs/minimap2.yaml"
shell:
"""
date
minimap2 -x map-ont -d {output} {input}
date
"""
rule hybrid_map_long_reads_to_hybrid_contigs_w_minimap2:
input:
query=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES),
index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"),
index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fna"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode=BARCODES, hybrid_assembler=HYBRID_ASSEMBLER)
output: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}.bam")
params:
prefix=os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}")
conda: "../envs/minimap2.yaml"
threads: config["minimap2"]["threads"]
log: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/metaspades.minimap2_samtools")
shell: """
(date &&\
minimap2 -a -t {threads} {input.index} {input.query} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule hybrid_map_short_reads_to_hybrid_contigs_w_minimap2:
input:
query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"),
query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="ONT3_MG_xx_Rashi_S11"),
index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"),
index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fna"), sr_sample="ONT3_MG_xx_Rashi_S11", barcode=BARCODES, hybrid_assembler=HYBRID_ASSEMBLER)
output: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}.bam")
params:
prefix=os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}")
conda: "../envs/minimap2.yaml"
threads: config["minimap2"]["threads"]
log: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/metaspades.minimap2_samtools")
shell:
"""
(date &&\
minimap2 -a -t {threads} {input.index} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output}
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule hybrid_merge_bam_files_for_metaspades:
input:
bam_sr=os.path.join(RESULTS_DIR, "mapping/{mapper}_sr_{hybrid_assembler}/{mapper}_sr_{hybrid_assembler}.bam"),
bam_lr=os.path.join(RESULTS_DIR, "mapping/{mapper}_lr_{hybrid_assembler}/{mapper}_lr_{hybrid_assembler}.bam")
output: os.path.join(RESULTS_DIR, "mapping/{mapper}_merged_{hybrid_assembler}/{mapper}_merged_{hybrid_assembler}.bam")
conda: "../envs/binning.yaml"
shell:
"""
(date &&\
samtools merge {output} {input.bam_sr} {input.bam_lr} &&\
date)
"""
# Rules for mapping the metaT reads to the different assemblies
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]
# SAMPLES=config["samples"]
SAMPLES=["flye", "megathit", "metaspades_hybrid"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]
###########
## metaT ##
###########
# Preprocess the metaT reads using fastp
rule run_fastp_on_metaT_reads:
input:
r1=os.path.join(config["metaT_prefix"], "{metaT_sample}_R1_001.fastq.gz"),
r2=os.path.join(config["metaT_prefix"], "{metaT_sample}_R2_001.fastq.gz")
output:
r1=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_R1_001.fastp.fq.gz"),
r2=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_R2_001.fastp.fq.gz"),
html=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp.html"),
json=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp.json")
log: os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp")
conda: "../envs/fastp.yaml"
shell:
"""
(date &&\
fastp -l {config[fastp][min_length]} -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} -h {output.html} -j {output.json} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule run_fastqc_on_preprocessed_metaT_reads:
input: os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_{suffix}.fq.gz"),
output:
html=os.path.join(RESULTS_DIR, "qc/metaT/fastqc/{metaT_sample}/{metaT_sample}_{suffix}.fastqc.html"),
zip=os.path.join(RESULTS_DIR, "qc/metaT/fastqc/{metaT_sample}/{metaT_sample}_{suffix}.fastqc.zip") # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename
conda: "../envs/fastqc.yaml"
script: "../scripts/run_fastqc.py" # Use a separate script rather than a simple call to FASTQC to overcome possible race conditions when running in parallel (https://bitbucket.org/snakemake/snakemake-wrappers/raw/7f5e01706728e32d52aa413f213e950f50bf4129/bio/fastqc/wrapper.py)
rule map_metaT_reads_to_long_read_contigs_w_bwa_mem:
input:
query_r1=rules.run_fastp_on_metaT_reads.output.r1,
query_r2=rules.run_fastp_on_metaT_reads.output.r2,
index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.amb"),
index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.ann"),
index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.bwt"),
index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.pac"),
index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.sa"),
index_fa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.fna")
output: os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.bam")
params:
index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly"),
out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x_{barcode}_contigs")
conda: "../envs/bwa.yaml"
threads: config["bwa"]["threads"]
log: os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.bwa_mem_samtools")
shell:
"""
(date &&\
bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule map_metaT_reads_to_short_read_contigs_w_bwa_mem:
input:
query_r1=rules.run_fastp_on_metaT_reads.output.r1,
query_r2=rules.run_fastp_on_metaT_reads.output.r2,
index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.amb"),
index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.ann"),
index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.bwt"),
index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.pac"),
index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.sa"),
index_fa=os.path.join(RESULTS_DIR, "assembly/megahit/{sr_sample}/final.contigs.fna")
output: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.bam")
params:
index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs"),
out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs")
conda: "../envs/bwa.yaml"
threads: config["bwa"]["threads"]
log: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.bwa_mem_samtools")
shell:
"""
(date &&\
bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule map_metaT_reads_to_hybrid_contigs_w_bwa_mem:
input:
query_r1=rules.run_fastp_on_metaT_reads.output.r1,
query_r2=rules.run_fastp_on_metaT_reads.output.r2,
index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.amb"),
index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.ann"),
index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.bwt"),
index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.pac"),
index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.sa"),
index_fa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fna")
output: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.bam")
params:
index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs"),
out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs")
log: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.bwa_mem_samtools")
conda: "../envs/bwa.yaml"
threads: config["bwa"]["threads"]
shell:
"""
(date &&\
bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# For running the MMSEQ2 comparison of proteins after assemblies are run through prokka/prodigal
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mm"]
# SAMPLES=config["samples"]
SAMPLES=["flye", "megahit", "metaspades_hybrid"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]
#############################
######## MMSEQ2 ############
#############################
rule mmseq2_database:
input:
int1=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr/merged/{barcode}/assembly.faa"), assembler="flye", barcode=BARCODES),
int2=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{sr_sample}/final.contigs.faa"), assembler="megahit", sr_sample="ONT3_MG_xx_Rashi_S11"),
int3=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.faa"), assembler="metaspades_hybrid", barcode=BARCODES, sr_sample="ONT3_MG_xx_Rashi_S11")
# expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{prefix}.faa"), assembler=["flye", "megahit", "metaspades_hybrid"])
output:
dat1=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="flye"),
dat2=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="megahit"),
dat3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid")
log: os.path.join(RESULTS_DIR, "annotation/mmseq2/database.mmseq2.log")
# conda: "../envs/cd-hit.yaml"
shell:
"""
(date &&\
{config[mmseqs][createdb]} {input.int1} {output.dat1} &&\
{config[mmseqs][createdb]} {input.int2} {output.dat2} &&\
{config[mmseqs][createdb]} {input.int3} {output.dat3} &&\
date) &> >(tee {log})
"""
rule mmseq2_compare:
input:
mm1=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="flye"),
mm2=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="megahit"),
mm3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid")
output:
mo1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh"),
mo2=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh"),
mo3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")
log: os.path.join(RESULTS_DIR, "annotation/mmseq2/compare.mmseq2.log")
threads: config["mmseq2"]["threads"]
# conda: "../envs/cd-hit.yaml"
shell:
"""
(date &&\
{config[mmseqs][rbh]} {input.mm1} {input.mm2} {output.mo1} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
{config[mmseqs][rbh]} {input.mm1} {input.mm3} {output.mo2} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
{config[mmseqs][rbh]} {input.mm2} {input.mm3} {output.mo3} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
date) &> >(tee {log})
"""
rule mmseq2_m8_format:
input:
fo1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_db"),
fo2=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_db"),
fo3=os.path.join(RESULTS_DIR, "annotation/mmseq2/metaspades_hybrid_db"),
fo4=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh"),
fo5=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh"),
fo6=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")
output:
form1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit.m8"),
form2=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid.m8"),
form3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid.m8")
log: os.path.join(RESULTS_DIR, "annotation/mmseq2/convertalis.mmseq2.log")
# conda: "../envs/cd-hit.yaml"
shell:
"""
(date &&\
{config[mmseqs][convertalis]} {input.fo1} {input.fo2} {input.fo4} {output.form1} &&\
{config[mmseqs][convertalis]} {input.fo1} {input.fo3} {input.fo5} {output.form2} &&\
{config[mmseqs][convertalis]} {input.fo2} {input.fo3} {input.fo6} {output.form3} &&\
date) &> >(tee {log})
"""
rule prepare_plot_files:
input: rules.mmseq2_m8_format.output
output:
touch(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done")),
prep1=os.path.join(RESULTS_DIR, "annotation/mmseq2/total.txt"),
prep2=os.path.join(RESULTS_DIR, "annotation/mmseq2/overlap_sizes.txt")
log: os.path.join(RESULTS_DIR, "annotation/mmseq2/files_ready.mmseq2.log")
shell:
"""
(date &&\
scripts/prepare_plot_files.sh &&\
date) &> >(tee {log})
"""
rule mmseq2_plots:
input:
plot1=os.path.join(RESULTS_DIR, "annotation/mmseq2/overlap_sizes.txt"),
plot2=os.path.join(RESULTS_DIR, "annotation/mmseq2/total.txt")
output:
touch(os.path.join(RESULTS_DIR, "annotation/mmseq2/upset_plots.done"))
log: os.path.join(RESULTS_DIR, "annotation/mmseq2/plot.mmseq2.log")
conda: "../envs/renv.yaml"
shell:
"""
(date &&\
Rscript scripts/mmseq_plots.R {input.plot1} {input.plot2} &&\
date) &> >(tee {log})
"""
# For running the contamination check and taxonomy on the generated MAGs
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mm"]
#SAMPLES=config["samples"]
SAMPLES=["flye", "megahit", "metaspades_hybrid"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]
#####################################
######## GTDBTK & CHECKM ############
#####################################
rule gtdbtk:
input:
gen=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")
threads:config["threads"]
conda:"../envs/gtdbtk.yaml"
params:
outdir=os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}")
output:
# gtdb=directory(os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}")),
gtdbfile=os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}/gtdbtk.bac120.summary.tsv")
wildcard_constraints:
sample='\w+'
shell:
"""
(date &&\
export GTDBTK_DATA_PATH={config[GTDBTK][DATA]} &&\
gtdbtk classify_wf --cpus {threads} -x fa --genome_dir {input.gen} --out_dir {params.outdir} &&\
touch gtdbtk.done &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule checkm:
input:
chkm=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")
threads:config["threads"]
conda:"../envs/checkm.yaml"
output:
chkmdir=directory(os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}/lineage.ms")),
chkout=os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}_output.txt")
shell:
"""
(date &&\
checkm lineage_wf -r -t {threads} -x fa {input.chkm} {output.chkmdir} | tee {output.chkout} &&\
touch checkm.done &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule plot_checkm:
input:
pl1=os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}"),
pl2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")
threads:config["threads"]
conda:"../envs/checkm.yaml"
output:
plt=directory(os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}/plots"))
shell:
"""
date
checkm bin_qa_plot --image_type png --width 8 --dpi 200 --font_size 8 -x fa -q {input.pl1} {input.pl2} {output.plt}
touch plot_checkm.done
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment