Skip to content
Snippets Groups Projects
Commit dc8a6b4f authored by Susheel Busi's avatar Susheel Busi
Browse files

created modular parts of the SNAKEFILE for ease of handling and downstream processes

parent 898bd724
No related branches found
No related tags found
1 merge request!51created modular parts of the SNAKEFILE for ease of handling and downstream processes
Showing
with 940 additions and 0 deletions
File added
WorkflowError in line 6 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/BINNING_SNAKEFILE:
Config file CONFIG.yaml not found.
File "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/BINNING_SNAKEFILE", line 6, in <module>
Building DAG of jobs...
MissingInputException in line 56 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/BINNING_SNAKEFILE:
Missing input files for rule depth_files:
results/mapping/flye/flye.bam
WorkflowError in line 8 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping:
Config file CONFIG.yaml not found.
File "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping", line 8, in <module>
CreateRuleException in line 23 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping:
The name MAPPING is already used by another rule
File "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping", line 23, in <module>
# Rules for BINNING workflow, i.e. generating MAGs from assemblies
import os
from tempfile import TemporaryDirectory

# All paths, sample lists and tool settings come from the workflow config.
configfile: "CONFIG.yaml"

DATA_DIR = config["data_dir"]          # location of the raw input data
RESULTS_DIR = config["results_dir"]    # root directory for all outputs
DB_DIR=config["db_dir"]                # reference databases
BARCODES=config["barcodes"]            # ONT barcodes to process
ASSEMBLERS=config["assemblers"]        # assemblers whose contigs get binned
MAPPERS=["bwa", "mm"]                  # NOTE(review): other modules use "mmi" — confirm which spelling is intended
SAMPLES=config["samples"]
BINNING_SAMPLES=config["binning_samples"]  # assemblies/mappings that go through binning
HYBRID_ASSEMBLER=config["hybrid_assembler"]

#####################################
########### MAXBIN2 #################
#####################################
rule maxbin2:
input:
bbref=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"),
bbam=os.path.join(RESULTS_DIR, "mapping/{sample}/{sample}.bam")
threads:config["threads"]
conda:"/home/users/sbusi/apps/environments/binning.yml"
output:
map=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}.sam"),
coverage=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_cov.txt"),
abund=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_abundance.txt"),
file=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/maxbin_output.001.fasta")
# mx=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/"))
shell:
"""
(date &&\
samtools view -h -o {output.map} {input.bbam} &&\
pileup.sh in={output.map} out={output.coverage} &&\
awk '{{print $1"\\t"$2}}' {output.coverage} | grep -v '^#' > {output.abund} &&\
run_MaxBin.pl -thread {threads} -contig {input.bbref} -out results/Binning/{wildcards.sample}/maxbin_output/maxbin_output -abund {output.abund} &&\
touch maxbin.done &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
#####################################
########### METABAT2 ################
#####################################
# Generate the per-contig depth table that MetaBAT2 consumes.
rule depth_files:
    input:
        dep1=os.path.join(RESULTS_DIR, "mapping/{sample}/{sample}.bam")
    threads:config["threads"]
    conda:"/home/users/sbusi/apps/environments/binning.yml"
    output:
        depout=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_depth.txt")
    # NOTE(review): {log} is used below but no log: directive is declared, so
    # the redirections target hidden ".stderr"/".stdout" files in cwd — confirm
    # and add a log path.
    shell:
        """
        (date &&\
        jgi_summarize_bam_contig_depths --outputDepth {output.depout} {input.dep1} &&\
        touch depth_file.done &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
# Bin contigs with MetaBAT2 using the depth table from rule depth_files.
rule metabat2:
    input:
        met1=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"),
        met2=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_depth.txt")
    threads:config["threads"]
    conda:"/home/users/sbusi/apps/environments/binning.yml"
    output:
        # Whole output directory is the target; bin FASTAs are written inside.
        outdir=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/metabat_output/"))
    # NOTE(review): {log} is referenced but no log: directive is declared — the
    # redirections fall back to ".stderr"/".stdout" in cwd; confirm.
    shell:
        """
        (date &&\
        metabat2 -i {input.met1} -a {input.met2} -o results/Binning/{wildcards.sample}/metabat_output/{wildcards.sample}.metabat -t {threads} -m 1500 -v --unbinned &&\
        touch metabat2.done &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
#####################################
########### DAS_Tool ################
#####################################
# Build the scaffolds2bin TSV tables DAS_Tool needs, one per binner.
rule scaffold_list:
    input:
        # MetaBAT2 output directory (bins with .fa extension).
        sca1=os.path.join(RESULTS_DIR, "Binning/{sample}/metabat_output/"),
        # sca2=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/")
        # First MaxBin2 bin — serves to order this rule after maxbin2.
        file=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/maxbin_output.001.fasta")
    threads:config["threads"]
    # conda:"/home/users/sbusi/apps/environments/base.yml"
    conda: "dastool.yaml"
    output:
        scout1=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv"),
        scout2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv")
    # NOTE(review): {log} is referenced but no log: directive is declared.
    # Also `export Rscript=...` points at a directory per the config — presumably
    # consumed by the DAS_Tool helper scripts; verify.
    shell:
        """
        (date &&\
        export Rscript={config[Rscript]} &&\
        Fasta_to_Scaffolds2Bin.sh -i {input.sca1} -e fa > {output.scout1} &&\
        Fasta_to_Scaffolds2Bin.sh -i results/Binning/{wildcards.sample}/maxbin_output -e fasta > {output.scout2} &&\
        touch scaffold_list.done &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
# Dereplicate/refine MaxBin2 + MetaBAT2 bins with DAS_Tool (diamond search).
rule DAS_Tool:
    input:
        da1=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv"),
        da2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv"),
        da3=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"),
        db=config["dastool_database"]
    threads:config["threads"]
    # conda:"/home/users/sbusi/apps/environments/base.yml"
    conda: "dastool.yaml"
    params:
        # Output prefix; DAS_Tool derives all of its file names from this.
        basename=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}")
    output:
        daout=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")),
        dafile=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_proteins.faa"),
        damfile=touch(os.path.join(RESULTS_DIR, "Binning/{sample}_das_tool.done"))
    # NOTE(review): {log} is referenced but no log: directive is declared.
    # NOTE(review): -l lists "maxbin2,metabat2" while -i passes metabat first —
    # labels are positional in DAS_Tool; confirm the order matches.
    shell:
        """
        (date &&\
        export Rscript={config[Rscript]} &&\
        DAS_Tool -i {input.da1},{input.da2} -c {input.da3} -o {params.basename} --search_engine diamond -l maxbin2,metabat2 --write_bins 1 --write_bin_evals 1 --threads {threads} --db_directory {input.db} --create_plots 1 &&\
        touch {output.damfile} &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
# NOTE(review): the YAML below concatenates several CONFIG fragments from the
# commit; top-level keys such as `barcodes`, `threads`, `bwa`, `samtools` and
# `minimap2` appear more than once — in YAML the last occurrence wins, so split
# or deduplicate before using this as a single config file.
data_dir: "data"
results_dir: "results"
db_dir: "dbs"
runs:
first: "20181106_1450_noselection_sizeselection"
second: "20181107_0906_same"
third: "20181108_0827_test"
barcodes: ["barcode06", "barcode07", "barcode08", "barcode09", "barcode10"]
assemblers: ["flye"]
p7zip:
bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
threads: 4
ont_fast5_api:
single_to_multi_fast5:
bin: "single_to_multi_fast5"
batch: 8000
threads: 8
flowcell: "FLO-MIN106"
kit: "SQK-LSK108"
#barcodes: ["barcode06", "barcode07", "barcode08", "barcode09", "barcode10"]
barcodes: ["barcode07"]
guppy_cpu:
path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
version: "cpu-3.1.5"
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
cpu_threads: 28
guppy_gpu:
path: "/home/users/sbusi/apps/ont-guppy/bin"
bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller"
version: "3.4.5+fb1fbfb"
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
hac_config: "dna_r9.4.1_450bps_hac.cfg"
records_per_fastq: 8000
chunk_size: 1000
chunks_per_runner: 1000
num_callers: 4
runners_per_device: 2
gpu_device: "cuda:0"
cpu_threads: 28
guppy_barcoder:
path: "/home/users/sbusi/apps/ont-guppy/bin"
bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder"
version: "3.4.5+fb1fbfb"
records_per_fastq: 8000
threads: 8
nanostats:
#short_reads_prefix: "/scratch/users/claczny/ont/fecal_pilot/data/raw/short_reads"
short_reads_prefix: "/mnt/isilon/projects/lcsb_sequencing/transfer/bioecosystem/Rashi/2019/Apr/fastq"
metaT_prefix: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/metaT/2018_GDB"
#samples: ["Kapa1_MG_S18", "Kapa2_MG_S19", "NEB1_MG_S16", "NEB2_MG_S17", "NEBmod1_MG_Rashi_S22", "NEBmod2_MG_Rashi_S23"]
fastp:
min_length: 40
minimap2:
threads: 16
igc:
uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
hg38:
uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
genomecov:
bin: "bedtools genomecov"
compute_avg_coverage:
bin: "scripts/coverage.awk"
bwa:
threads: 24
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
samtools:
sort:
threads: 4
chunk_size: "4G"
view:
threads: 4
flye:
bin: "flye"
threads: 27
genome_size: "1g"
operams:
bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
threads: 28
megahit:
threads: 28
nonpareil:
memory: 4096
threads: 14
medaka:
threads: 28
racon:
threads: 28
rebaler:
threads: 28
diamond:
threads: 28
db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
metaspades:
threads: 28
mmseq2:
threads: 24
# Define sample names
# samples: ["flye", "megahit", "metaspades"]
# samples: ["flye", "megahit"]
samples: ["metaspades_hybrid"]
#binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
binning_samples: ["megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
# Hybrid assembler
hybrid_assembler: "metaspades_hybrid"
# Directory where fastq files are
#data_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning"
# Directory to save the output to
#results_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning"
# Number of cpus or threads to use
threads: 28
# Path to the the 140GB Kraken2 database
kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# Path to DAS_Tool
DAS_Tool:
path: "/home/users/sbusi/apps/DAS_Tool-master"
bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
# Path to DAS_Tool database
dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# Mapping options
bwa:
threads: 24
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
samtools:
sort:
threads: 4
chunk_size: "4G"
view:
threads: 4
minimap2:
threads: 24
# Path to GTDBTK database
GTDBTK:
DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
# Rscript path
Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# Rule for running the mapping "workflow" for ONT analsyes
#shell.executable("/bin/bash")
#shell.prefix("source ~/.bashrc; ")
import os
from tempfile import TemporaryDirectory

# Shared settings for the mapping rules, read from the workflow config.
configfile: "CONFIG.yaml"

DATA_DIR = config["data_dir"]        # raw input data
RESULTS_DIR = config["results_dir"]  # root for all outputs
DB_DIR=config["db_dir"]              # reference databases
BARCODES=config["barcodes"]          # ONT barcodes to map
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]               # mapper name prefixes used in output paths
SAMPLES=config["samples"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]

#######################
## Mapping to hybrid ##
#######################
rule build_hybrid_bwa_index:
input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
output:
# done=touch("bwa_index.done")
index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.amb"),
index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.ann"),
index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwt"),
index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.pac"),
index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.sa")
params:
prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}")
log: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwa_index")
conda: "envs/bwa.yaml"
shell: """
(date &&\
bwa index {input} -p {params.prefix} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# Map the fastp-trimmed short reads onto the hybrid contigs with bwa mem and
# produce a coordinate-sorted BAM.
rule hybrid_map_short_reads_to_hybrid_contigs_w_bwa_mem:
    input:
        query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"),
        query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"),
        # Index files produced by rule build_hybrid_bwa_index ({prefix}=metaspades).
        index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"),
        index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"),
        index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"),
        index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"),
        index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"),
        # Contig FASTA used by `samtools view -T` to rebuild the header.
        index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
    output:
        os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}.bam")
    params:
        index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"),
        out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}")
    log: os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/sr_contigs.bwa_mem_samtools")
    conda: "envs/bwa.yaml"
    threads:
        config["bwa"]["threads"]
    # wildcard_constraints:
    # sample='\w+'
    shell: """
        (date &&\
        bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
# Map the merged ONT long reads onto the hybrid contigs with bwa mem (ont2d
# preset) and produce a coordinate-sorted BAM.
rule hybrid_map_long_reads_to_hybrid_contigs_w_bwa_mem:
    input:
        query_lr=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES),
        index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"),
        index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"),
        index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"),
        index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"),
        index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"),
        # Contig FASTA for `samtools view -T` header reconstruction.
        index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
    output:
        os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}.bam")
    params:
        index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"),
        out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}"),
    log: os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/lr_contigs.bwa_mem_samtools")
    conda: "envs/bwa.yaml"
    threads: config["bwa"]["threads"]
    # wildcard_constraints:
    # sample='\w+'
    shell: """
        (date &&\
        bwa mem -x ont2d -t {threads} {params.index_prefix} {input.query_lr} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
# Build a minimap2 index (.mmi) of the hybrid contigs with the map-ont preset.
rule hybrid_build_minimap2_long_read_contigs_ont_index:
    input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
    output: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi")
    conda: "envs/minimap2.yaml"
    # Unlike the sibling rules, no log redirection here; Snakemake's strict
    # shell mode still aborts the job if minimap2 fails.
    shell:
        """
        date
        minimap2 -x map-ont -d {output} {input}
        date
        """
# Map the merged ONT long reads onto the hybrid contigs with minimap2 and
# produce a coordinate-sorted BAM.
rule hybrid_map_long_reads_to_hybrid_contigs_w_minimap2:
    input:
        query=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode="barcode07"),
        index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"),
        # Contig FASTA for `samtools view -T` header reconstruction.
        index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
    output: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}.bam")
    params:
        prefix=os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}")
    conda: "envs/minimap2.yaml"
    threads: config["minimap2"]["threads"]
    log: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/metaspades.minimap2_samtools")
    shell: """
        (date &&\
        minimap2 -a -t {threads} {input.index} {input.query} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
rule hybrid_map_short_reads_to_hybrid_contigs_w_minimap2:
input:
query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"),
query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"),
index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"),
index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
output: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}.bam")
params:
prefix=os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}")
conda: "envs/minimap2.yaml"
threads: config["minimap2"]["threads"]
log: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/metaspades.minimap2_samtools")
shell:
"""
(date &&\
minimap2 -a -t {threads} {input.index} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output}
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# Merge the short-read and long-read BAMs of one mapper into a single BAM.
rule hybrid_merge_bam_files_for_metaspades:
    input:
        bam_sr=os.path.join(RESULTS_DIR, "mapping/{mapper}_sr_{hybrid_assembler}/{mapper}_sr_{hybrid_assembler}.bam"),
        bam_lr=os.path.join(RESULTS_DIR, "mapping/{mapper}_lr_{hybrid_assembler}/{mapper}_lr_{hybrid_assembler}.bam")
    output: os.path.join(RESULTS_DIR, "mapping/{mapper}_merged_{hybrid_assembler}/{mapper}_merged_{hybrid_assembler}.bam")
    conda: "/home/users/sbusi/apps/environments/binning.yml"
    # No log redirection here, unlike the sibling mapping rules.
    shell:
        """
        (date &&\
        samtools merge {output} {input.bam_sr} {input.bam_lr} &&\
        date)
        """
# Rules for mapping the metaT reads to the different assemblies
import os
from tempfile import TemporaryDirectory

# Shared settings for the metatranscriptomic (metaT) mapping rules.
configfile: "CONFIG.yaml"

DATA_DIR = config["data_dir"]        # raw input data
RESULTS_DIR = config["results_dir"]  # root for all outputs
DB_DIR=config["db_dir"]              # reference databases
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]               # mapper name prefixes used in output paths
SAMPLES=config["samples"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]

###########
## metaT ##
###########
# Preprocess the metaT reads using fastp
rule run_fastp_on_metaT_reads:
    input:
        r1=os.path.join(config["metaT_prefix"], "{metaT_sample}_R1_001.fastq.gz"),
        r2=os.path.join(config["metaT_prefix"], "{metaT_sample}_R2_001.fastq.gz")
    output:
        r1=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_R1_001.fastp.fq.gz"),
        r2=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_R2_001.fastp.fq.gz"),
        # QC reports emitted alongside the trimmed reads.
        html=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp.html"),
        json=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp.json")
    log: os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp")
    conda: "envs/fastp.yaml"
    shell:
        """
        (date &&\
        fastp -l {config[fastp][min_length]} -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} -h {output.html} -j {output.json} &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
# QC the trimmed metaT reads with FastQC.
rule run_fastqc_on_preprocessed_metaT_reads:
    input: os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_{suffix}.fq.gz"),
    output:
        html=os.path.join(RESULTS_DIR, "qc/metaT/fastqc/{metaT_sample}/{metaT_sample}_{suffix}.fastqc.html"),
        zip=os.path.join(RESULTS_DIR, "qc/metaT/fastqc/{metaT_sample}/{metaT_sample}_{suffix}.fastqc.zip") # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename
    conda: "envs/fastqc.yaml"
    script: "scripts/run_fastqc.py" # Use a separate script rather than a simple call to FASTQC to overcome possible race conditions when running in parallel (https://bitbucket.org/snakemake/snakemake-wrappers/raw/7f5e01706728e32d52aa413f213e950f50bf4129/bio/fastqc/wrapper.py)
# Map trimmed metaT reads onto long-read (ONT) assembly contigs with bwa mem.
rule map_metaT_reads_to_long_read_contigs_w_bwa_mem:
    input:
        query_r1=rules.run_fastp_on_metaT_reads.output.r1,
        query_r2=rules.run_fastp_on_metaT_reads.output.r2,
        # BWA index files for the long-read assembly.
        index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.amb"),
        index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.ann"),
        index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.bwt"),
        index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.pac"),
        index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.sa"),
        # Contig FASTA for `samtools view -T` header reconstruction.
        index_fa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.fna")
    output: os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.bam")
    params:
        index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly"),
        out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x_{barcode}_contigs")
    conda: "envs/bwa.yaml"
    threads: config["bwa"]["threads"]
    log: os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.bwa_mem_samtools")
    shell:
        """
        (date &&\
        bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
rule map_metaT_reads_to_short_read_contigs_w_bwa_mem:
input:
query_r1=rules.run_fastp_on_metaT_reads.output.r1,
query_r2=rules.run_fastp_on_metaT_reads.output.r2,
index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.amb"),
index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.ann"),
index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.bwt"),
index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.pac"),
index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.sa"),
index_fa=os.path.join(RESULTS_DIR, "assembly/megahit/{sr_sample}/final.contigs.fna")
output: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.bam")
params:
index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs"),
out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs")
conda: "envs/bwa.yaml"
threads: config["bwa"]["threads"]
log: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.bwa_mem_samtools")
shell:
"""
(date &&\
bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# Map trimmed metaT reads onto hybrid-assembly contigs with bwa mem.
rule map_metaT_reads_to_hybrid_contigs_w_bwa_mem:
    input:
        query_r1=rules.run_fastp_on_metaT_reads.output.r1,
        query_r2=rules.run_fastp_on_metaT_reads.output.r2,
        index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.amb"),
        index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.ann"),
        index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.bwt"),
        index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.pac"),
        index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.sa"),
        # Contig FASTA for `samtools view -T` header reconstruction.
        index_fa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fna")
    # NOTE(review): this output lives in mapping/metaT/sr/ and its name pattern
    # can also match the short-read rule's pattern — Snakemake may report
    # ambiguous rules; consider a distinct subdirectory or wildcard_constraints.
    output: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.bam")
    params:
        index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs"),
        out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs")
    log: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.bwa_mem_samtools")
    conda: "envs/bwa.yaml"
    threads: config["bwa"]["threads"]
    shell:
        """
        (date &&\
        bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
# For running the MMSEQ2 comparison of proteins after assemblies are run through prokka/prodigal
import os
from tempfile import TemporaryDirectory

# Shared settings for the MMseqs2 protein-comparison rules.
configfile: "CONFIG.yaml"

DATA_DIR = config["data_dir"]        # raw input data
RESULTS_DIR = config["results_dir"]  # root for all outputs
DB_DIR=config["db_dir"]              # reference databases
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mm"]                # NOTE(review): mapping module uses "mmi" — confirm
SAMPLES=config["samples"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]

#############################
######## MMSEQ2 ############
#############################
# Create one MMseqs2 sequence database per assembler's predicted proteins.
rule mmseq2_database:
    input:
        int1=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr/merged/{barcode}/assembly.faa"), assembler="flye", barcode=BARCODES),
        int2=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{sample}/final.contigs.faa"), assembler="megahit", sample=SAMPLES),
        int3=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr_{barcode}-sr_{sample}/contigs.faa"), assembler="metaspades_hybrid", barcode=BARCODES, sample=SAMPLES)
        # expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{prefix}.faa"), assembler=["flye", "megahit", "metaspades_hybrid"])
    output:
        dat1=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="flye"),
        dat2=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="megahit"),
        dat3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid")
    log: os.path.join(RESULTS_DIR, "annotation/mmseq2/database.mmseq2.log")
    # NOTE(review): the conda env file is named "cd-hit.yaml" but must provide
    # the mmseqs binary — verify the environment contents.
    conda: "cd-hit.yaml"
    shell:
        """
        (date &&\
        mmseqs createdb {input.int1} {output.dat1} &&\
        mmseqs createdb {input.int2} {output.dat2} &&\
        mmseqs createdb {input.int3} {output.dat3} &&\
        date) &> >(tee {log})
        """
# Pairwise reciprocal-best-hit comparison of the three protein databases.
rule mmseq2_compare:
    input:
        mm1=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="flye"),
        mm2=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="megahit"),
        mm3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid")
    output:
        mo1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh"),
        mo2=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh"),
        mo3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")
    log: os.path.join(RESULTS_DIR, "annotation/mmseq2/compare.mmseq2.log")
    threads: config["mmseq2"]["threads"]
    conda: "cd-hit.yaml"
    # "mmseq2_tmp" is a scratch dir created in the working directory by mmseqs.
    shell:
        """
        (date &&\
        mmseqs rbh {input.mm1} {input.mm2} {output.mo1} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
        mmseqs rbh {input.mm1} {input.mm3} {output.mo2} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
        mmseqs rbh {input.mm2} {input.mm3} {output.mo3} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
        date) &> >(tee {log})
        """
# Convert the RBH result databases into BLAST-tab (.m8) files.
rule mmseq2_m8_format:
    input:
        # Source/target sequence DBs plus the RBH result DBs from mmseq2_compare.
        fo1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_db"),
        fo2=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_db"),
        fo3=os.path.join(RESULTS_DIR, "annotation/mmseq2/metaspades_hybrid_db"),
        fo4=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh"),
        fo5=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh"),
        fo6=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")
    output:
        form1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit.m8"),
        form2=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid.m8"),
        form3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid.m8")
    log: os.path.join(RESULTS_DIR, "annotation/mmseq2/convertalis.mmseq2.log")
    conda: "cd-hit.yaml"
    shell:
        """
        (date &&\
        mmseqs convertalis {input.fo1} {input.fo2} {input.fo4} {output.form1} &&\
        mmseqs convertalis {input.fo1} {input.fo3} {input.fo5} {output.form2} &&\
        mmseqs convertalis {input.fo2} {input.fo3} {input.fo6} {output.form3} &&\
        date) &> >(tee {log})
        """
# Run the helper script that assembles the input tables for the upset plots.
rule prepare_plot_files:
    # NOTE(review): no input: is declared, so this rule does not wait for the
    # .m8 files, and it relies on ./prepare_plot_files.sh existing in the
    # working directory — confirm the intended ordering.
    output:
        touch(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done"))
    log: os.path.join(RESULTS_DIR, "annotation/mmseq2/files_ready.mmseq2.log")
    shell:
        """
        (date &&\
        ./prepare_plot_files.sh &&\
        date) &> >(tee {log})
        """
# Draw the upset plots from the overlap/total tables via an R script.
rule mmseq2_plots:
    input:
        plot1=os.path.join(RESULTS_DIR, "annotation/mmseq2/overlap_sizes.txt"),
        plot2=os.path.join(RESULTS_DIR, "annotation/mmseq2/total.txt")
    output:
        # Marker file only; the plots themselves are written by the R script.
        touch(os.path.join(RESULTS_DIR, "annotation/mmseq2/upset_plots.done"))
    log: os.path.join(RESULTS_DIR, "annotation/mmseq2/plot.mmseq2.log")
    conda: "renv.yaml"
    # NOTE(review): mmseq_plots.R is resolved relative to the working directory.
    shell:
        """
        (date &&\
        Rscript mmseq_plots.R {input.plot1} {input.plot2} &&\
        date) &> >(tee {log})
        """
# For running the contamination check and taxonomy on the generated MAGs
import os
from tempfile import TemporaryDirectory

# Shared settings for the GTDB-Tk / CheckM rules.
configfile: "CONFIG.yaml"

DATA_DIR = config["data_dir"]        # raw input data
RESULTS_DIR = config["results_dir"]  # root for all outputs
DB_DIR=config["db_dir"]              # reference databases
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mm"]                # NOTE(review): mapping module uses "mmi" — confirm
SAMPLES=config["samples"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]

#####################################
######## GTDBTK & CHECKM ############
#####################################
rule gtdbtk:
input:
gen=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")
threads:config["threads"]
conda:"gtdbtk.yaml"
params:
outdir=os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}")
output:
# gtdb=directory(os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}")),
gtdbfile=os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}/gtdbtk.bac120.summary.tsv")
wildcard_constraints:
sample='\w+'
shell:
"""
(date &&\
export GTDBTK_DATA_PATH={config[GTDBTK][DATA]} &&\
gtdbtk classify_wf --cpus {threads} -x fa --genome_dir {input.gen} --out_dir {params.outdir} &&\
touch gtdbtk.done &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# Estimate completeness/contamination of the DAS Tool bins with CheckM's
# lineage workflow; the console report is teed into a per-sample text file.
rule checkm:
    input:
        chkm=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")
    threads: config["threads"]
    conda: "checkm.yml"
    output:
        # NOTE(review): `lineage.ms` is the marker-set FILE CheckM writes
        # inside its output directory, yet this path is passed to lineage_wf
        # as the output directory below — confirm the intended layout.
        chkmdir=directory(os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}/lineage.ms")),
        chkout=os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}_output.txt")
    # FIX: the shell block references {log}, but no `log:` directive was
    # declared, so the tees wrote to bare ".stderr"/".stdout" files.
    log: os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}/checkm")
    shell:
        """
        (date &&\
        checkm lineage_wf -r -t {threads} -x fa {input.chkm} {output.chkmdir} | tee {output.chkout} &&\
        touch checkm.done &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
# Produce CheckM's bin-QA PNG plots for one sample's DAS Tool bins.
rule plot_checkm:
    input:
        pl1=os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}"),
        pl2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")
    threads: config["threads"]
    # NOTE(review): absolute home-directory path is not portable — prefer a
    # repo-relative env file like the sibling rules ("checkm.yml").
    conda: "/home/users/sbusi/apps/environments/checkm.yml"
    output:
        plt=directory(os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}/plots"))
    # FIX: {log} was referenced without a `log:` directive.
    log: os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}/plot_checkm")
    shell:
        # FIX: the original block opened with a bare `date` but closed with
        # `date)`, an unbalanced parenthesis that is a guaranteed shell syntax
        # error; commands are now grouped and chained with `&&` like the
        # sibling rules so a failure aborts the recipe.
        """
        (date &&\
        checkm bin_qa_plot --image_type png --width 8 --dpi 200 --font_size 8 -x fa -q {input.pl1} {input.pl2} {output.plt} &&\
        touch plot_checkm.done &&\
        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
        """
# For running the BINNING "workflow"
import os
from tempfile import TemporaryDirectory

configfile: "CONFIG.yaml"

# Shared directory layout and sample lists (read from CONFIG.yaml)
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]
SAMPLES=config["samples"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]

# specify which rules to run
include:
    'BINNING_RULES'

# Rule all for BINNING: pseudo-target requesting the per-sample assemblies
# and DAS Tool protein predictions, then dropping a completion flag file.
rule BINNING:
    input:
        expand(os.path.join(RESULTS_DIR, "assembly/{sample}.fa"), sample=BINNING_SAMPLES),
        expand(os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_proteins.faa"), sample=BINNING_SAMPLES),
    output:
        touch('binning_for_ont.done')
# Workflow for running mapping steps of different assemblers, and mappers for the Binning workflow
import os
from tempfile import TemporaryDirectory

configfile: "CONFIG.yaml"

# Shared directory layout and sample lists (read from CONFIG.yaml)
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]
SAMPLES=config["samples"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]

# specify which rules to run
include:
    'MAPPING_RULES'

# Rule all for mapping: pseudo-target requesting the BWA index, the per-mapper
# short-/long-read BAMs, the minimap2 index and the merged BAMs for the
# hybrid assembly, then dropping a completion flag file.
rule MAPPING:
    input:
        expand(os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"), hybrid_assembler=HYBRID_ASSEMBLER),
        expand(os.path.join(RESULTS_DIR, "mapping/{mapper}_{reads}_{hybrid_assembler}/{mapper}_{reads}_{hybrid_assembler}.bam"), mapper=MAPPERS, reads=["sr", "lr"], hybrid_assembler=HYBRID_ASSEMBLER),
        expand(os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"), hybrid_assembler=HYBRID_ASSEMBLER),
        expand(os.path.join(RESULTS_DIR, "mapping/{mapper}_merged_{hybrid_assembler}/{mapper}_merged_{hybrid_assembler}.bam"), mapper=MAPPERS, hybrid_assembler=HYBRID_ASSEMBLER)
    output:
        touch('mapping_for_binning.done')
# Workflow for running mapping metaT reads to different assemblies using different mappers for the Binning workflow
import os
from tempfile import TemporaryDirectory

configfile: "CONFIG.yaml"

# Shared directory layout and sample lists (read from CONFIG.yaml)
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]
SAMPLES=config["samples"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]

# specify which rules to run
include:
    'METAT_RULES'

# Rule all for mappinng metaT reads to all assemblies: requests fastp QC
# reports, the metaT-vs-assembly BAMs and the per-contig average-coverage
# tables.  NOTE(review): the metaT/short-read sample names are hard-coded
# here ("GDB_2018_metaT", "NEB2_MG_S17") rather than read from CONFIG.yaml.
rule METAT:
    input:
        expand(os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp.{report_type}"), metaT_sample="GDB_2018_metaT", report_type=["html", "json"]),
        # expand(os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}-x-{barcode}.bam"), metaT_sample="GDB_2018_metaT", barcode=BARCODES),
        expand(os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.bam"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", assembler="megahit"),
        expand(os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.bam"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", barcode=BARCODES, assembler="metaspades_hybrid"),
        expand(os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.bam"), metaT_sample="GDB_2018_metaT", barcode=BARCODES, assembler=ASSEMBLERS),
        expand(os.path.join(RESULTS_DIR, "genomecov/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.avg_cov.txt"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", assembler="megahit"),
        expand(os.path.join(RESULTS_DIR, "genomecov/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.avg_cov.txt"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", barcode=BARCODES, assembler="metaspades_hybrid"),
        expand(os.path.join(RESULTS_DIR, "genomecov/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.avg_cov.txt"), metaT_sample="GDB_2018_metaT", barcode=BARCODES, assembler=ASSEMBLERS)
    output:
        touch('metaT_mapping_for_ONT.done')
# For running the MMSEQ2 "workflow"
import os
from tempfile import TemporaryDirectory

configfile: "CONFIG.yaml"

# Shared directory layout and sample lists (read from CONFIG.yaml)
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]
SAMPLES=config["samples"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]

# specify which rules to run
include:
    'MMSEQ_RULES'

# Rule all for running the MMSEQ2 analyses on the proteins after assembly:
# requests the per-assembler mmseqs databases, the pairwise reciprocal-best-hit
# results, the tabular (.m8) alignments and the final UpSet plots.
rule MMSEQ:
    input:
        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler=["flye", "megahit", "metaspades_hybrid"]),
        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh")),
        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh")),
        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")),
        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit.m8")),
        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid.m8")),
        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid.m8")),
        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done")),
        # expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done")),
        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/upset_plots.done"))
    output:
        touch('mmseq_comparison_for_ont.done')
# For running the TAXONOMY "workflow"
import os
from tempfile import TemporaryDirectory

configfile: "CONFIG.yaml"

# Shared directory layout and sample lists (read from CONFIG.yaml)
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]
SAMPLES=config["samples"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]

# specify which rules to run
include:
    'TAXONOMY_RULES'

# Rule all for TAXONOMY: pseudo-target requesting the per-sample CheckM
# reports and GTDB-Tk summaries, then dropping a completion flag file.
# FIX: this target rule was copy-pasted as `BINNING`, which both contradicts
# the comment/flag file and collides with the rule of the same name in the
# BINNING snakefile (Snakemake raises CreateRuleException on duplicate rule
# names, as seen for MAPPING in this commit's error log).  Renamed to
# TAXONOMY to match the workflow it drives.
rule TAXONOMY:
    input:
        expand(os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}_output.txt"), sample=BINNING_SAMPLES),
        expand(os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}/gtdbtk.bac120.summary.tsv"), sample=BINNING_SAMPLES)
    output:
        touch('taxonomy_for_ont.done')
# include global functions
include:
    "workflow/rules/function.definitions.smk"

# include configuration file
include:
    "workflow/rules/ini/config.smk"

# set working directory and dump output
# NOTE(review): OUTPUTDIR is presumably defined by the included config.smk
# above — confirm it is set before this `workdir:` directive is evaluated.
workdir:
    OUTPUTDIR
elif MG:
if 'preprocessing' in IMP_STEPS:
if len(MG) == 2:
include:
"workflow/rules/modules/single_omics/mg/Preprocessing.smk"
if len(MG) == 1:
include:
"workflow/rules/modules/single_omics/mg/PreprocessingSE.smk"
if 'assembly' in IMP_STEPS:
include:
"workflow/rules/modules/single_omics/mg/Assembly.smk"
if 'analysis' in IMP_STEPS:
include:
"workflow/rules/modules/single_omics/mg/Analysis.smk"
if 'taxonomy' in IMP_STEPS:
include:
"workflow/rules/modules/single_omics/mg/Taxonomy.smk"
if 'binning' in IMP_STEPS:
include:
"workflow/rules/modules/single_omics/mg/Binning.smk"
# master command
# Aggregate pseudo-target: `inputs_all` is assembled by the module snakefiles
# conditionally included above; a flag file records overall completion.
rule ALL:
    input:
        inputs_all
    output:
        touch('status/all.done')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment