From dc8a6b4f63f83862f1ceb16d3c9e4ad1f8fc847d Mon Sep 17 00:00:00 2001 From: "Susheel Bhanu Busi susheel.busi@uni.lu" <susheel.busi@uni.lu> Date: Sat, 25 Apr 2020 02:03:49 +0200 Subject: [PATCH] created modular parts of the SNAKEFILE for ease of handling and downstream processes --- workflows/.Snakefile.swp | Bin 0 -> 12288 bytes .../2020-04-24T185411.179436.snakemake.log | 3 + .../2020-04-24T185419.525822.snakemake.log | 4 + .../2020-04-25T001612.138705.snakemake.log | 3 + .../2020-04-25T001631.788549.snakemake.log | 3 + workflows/BINNING_RULES | 125 +++++++++++++++ workflows/CONFIG.yaml | 148 +++++++++++++++++ workflows/MAPPING_RULES | 151 ++++++++++++++++++ workflows/METAT_RULES | 120 ++++++++++++++ workflows/MMSEQ_RULES | 110 +++++++++++++ workflows/TAXONOMY_RULES | 72 +++++++++ workflows/binning | 28 ++++ workflows/mapping | 29 ++++ workflows/metat | 33 ++++ workflows/mmseq | 36 +++++ workflows/taxonomy | 28 ++++ workflows/workflows_snakefile | 47 ++++++ 17 files changed, 940 insertions(+) create mode 100644 workflows/.Snakefile.swp create mode 100644 workflows/.snakemake/log/2020-04-24T185411.179436.snakemake.log create mode 100644 workflows/.snakemake/log/2020-04-24T185419.525822.snakemake.log create mode 100644 workflows/.snakemake/log/2020-04-25T001612.138705.snakemake.log create mode 100644 workflows/.snakemake/log/2020-04-25T001631.788549.snakemake.log create mode 100644 workflows/BINNING_RULES create mode 100644 workflows/CONFIG.yaml create mode 100644 workflows/MAPPING_RULES create mode 100644 workflows/METAT_RULES create mode 100644 workflows/MMSEQ_RULES create mode 100644 workflows/TAXONOMY_RULES create mode 100644 workflows/binning create mode 100644 workflows/mapping create mode 100644 workflows/metat create mode 100644 workflows/mmseq create mode 100644 workflows/taxonomy create mode 100644 workflows/workflows_snakefile diff --git a/workflows/.Snakefile.swp b/workflows/.Snakefile.swp new file mode 100644 index 0000000000000000000000000000000000000000..49aeea5296946efd483307eb0ced11e4b91f6acb GIT binary patch literal 12288 zcmeI2&1(}u7>B1Kiq)zpdY&do^<cKCf)J?Gk6Mbv29xMT2%FtWJ9Kt7?CjDO1&{UU zU*JhR2zt<iH$8Y0!J`GesaHR)et^H->}ndMO{JHXcj3uwW@q-@d1l_tA=4baHeKdt z3a1&4gYie_*m<8Z(~OMFD%L1%#}UR93o?-Aq@yAeil7i_Sy0hpcQ?z^p{0xy*r79L zMMeb1N@7?uS8A3c+(0@FUx$sRZG>YhCH1Ve;0N<9<u4eks_l8vl1g;nlg`Ni8R*x* zQ2o?caWrn7&kgg#hwA-yq;_P043GgbKnBPF86X2>fDCK{17UlB-Nra}q+^*#%YmLU z{YnKHAOmE843GgbKnBPF86X2>fDDiUGOz^=*e+wAcQZD)2M>?`|0mKhd^*6`d+-=M z0(ZeOa6l83KoR6Y4h(}`;Mab}R=^AJ5H!FgPy`=`82bQTf+wI2d@uvff}i^s`vKm8 zSKtm<26Ny#I0}w{?|T{h23~`w;2x-g5;z7{1{r$`o`DCz1rE3X#=uvM^$U0do`YN9 zCb$7EgT%!aoJ)FSGC&5%02v?yWMHEPc-N83b{$$lq=d0N-%Uz`9f?Xbd`~*4oVCtN ztx;7l-h7T_{iVhpNLQ4S3fbZ#%}%QkE;XMnR~psYWTjeSe>%h+`;Az$7k%w}OY4LT z(`K6z^PINTl96Ven0DH1b7C4Jyrx=joOar5&6peUTM0xv@Z<fgnAhs$>LJHBS~a^# zMT;w;kC&%T@bPh8>`Z%nAv^k=+Et-99C+;SdTV!&q1TL1Sd!(b63g>yvYL6ano-5G zMgM;*lX9Ue<tZ1oO)juFk7w$&O1*Y*`YMZ)#FasuKSqQ}&LE9As)Q5z!4kK%%iYLp zb3Y2(QP@eQ-gki1(sF#=lId^9pSMard%^I|<p`VQrLTEs-+TNhy|-#b+KHPKTx^cl VGCob;)c?5Ytn!<-;#nWVeglnSbT9w_ literal 0 HcmV?d00001 diff --git a/workflows/.snakemake/log/2020-04-24T185411.179436.snakemake.log b/workflows/.snakemake/log/2020-04-24T185411.179436.snakemake.log new file mode 100644 index 0000000..6f3df2a --- /dev/null +++ b/workflows/.snakemake/log/2020-04-24T185411.179436.snakemake.log @@ -0,0 +1,3 @@ +WorkflowError in line 6 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/BINNING_SNAKEFILE: +Config file CONFIG.yaml not found. + File "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/BINNING_SNAKEFILE", line 6, in <module> diff --git a/workflows/.snakemake/log/2020-04-24T185419.525822.snakemake.log b/workflows/.snakemake/log/2020-04-24T185419.525822.snakemake.log new file mode 100644 index 0000000..a3c2bb8 --- /dev/null +++ b/workflows/.snakemake/log/2020-04-24T185419.525822.snakemake.log @@ -0,0 +1,4 @@ +Building DAG of jobs... +MissingInputException in line 56 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/BINNING_SNAKEFILE: +Missing input files for rule depth_files: +results/mapping/flye/flye.bam diff --git a/workflows/.snakemake/log/2020-04-25T001612.138705.snakemake.log b/workflows/.snakemake/log/2020-04-25T001612.138705.snakemake.log new file mode 100644 index 0000000..68bdbd5 --- /dev/null +++ b/workflows/.snakemake/log/2020-04-25T001612.138705.snakemake.log @@ -0,0 +1,3 @@ +WorkflowError in line 8 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping: +Config file CONFIG.yaml not found. + File "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping", line 8, in <module> diff --git a/workflows/.snakemake/log/2020-04-25T001631.788549.snakemake.log b/workflows/.snakemake/log/2020-04-25T001631.788549.snakemake.log new file mode 100644 index 0000000..98e738f --- /dev/null +++ b/workflows/.snakemake/log/2020-04-25T001631.788549.snakemake.log @@ -0,0 +1,3 @@ +CreateRuleException in line 23 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping: +The name MAPPING is already used by another rule + File "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping", line 23, in <module> diff --git a/workflows/BINNING_RULES b/workflows/BINNING_RULES new file mode 100644 index 0000000..3bb7ad5 --- /dev/null +++ b/workflows/BINNING_RULES @@ -0,0 +1,125 @@ +# Rules for BINNING workflow, i.e. generating MAGs from assemblies + +import os +from tempfile import TemporaryDirectory + +configfile: "CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa", "mm"] +SAMPLES=config["samples"] +BINNING_SAMPLES=config["binning_samples"] +HYBRID_ASSEMBLER=config["hybrid_assembler"] + + +##################################### +########### MAXBIN2 ################# +##################################### +rule maxbin2: + input: + bbref=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"), + bbam=os.path.join(RESULTS_DIR, "mapping/{sample}/{sample}.bam") + threads:config["threads"] + conda:"/home/users/sbusi/apps/environments/binning.yml" + output: + map=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}.sam"), + coverage=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_cov.txt"), + abund=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_abundance.txt"), + file=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/maxbin_output.001.fasta") +# mx=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/")) + shell: + """ + (date &&\ + samtools view -h -o {output.map} {input.bbam} &&\ + pileup.sh in={output.map} out={output.coverage} &&\ + awk '{{print $1"\\t"$2}}' {output.coverage} | grep -v '^#' > {output.abund} &&\ + run_MaxBin.pl -thread {threads} -contig {input.bbref} -out results/Binning/{wildcards.sample}/maxbin_output/maxbin_output -abund {output.abund} &&\ + touch maxbin.done &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +##################################### +########### METABAT2 ################ +##################################### + +rule depth_files: + input: + dep1=os.path.join(RESULTS_DIR, "mapping/{sample}/{sample}.bam") + threads:config["threads"] + conda:"/home/users/sbusi/apps/environments/binning.yml" + output: + depout=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_depth.txt") + shell: + """ + (date &&\ + jgi_summarize_bam_contig_depths --outputDepth {output.depout} {input.dep1} &&\ + touch depth_file.done &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule metabat2: + input: + met1=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"), + met2=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_depth.txt") + threads:config["threads"] + conda:"/home/users/sbusi/apps/environments/binning.yml" + output: + outdir=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/metabat_output/")) + shell: + """ + (date &&\ + metabat2 -i {input.met1} -a {input.met2} -o results/Binning/{wildcards.sample}/metabat_output/{wildcards.sample}.metabat -t {threads} -m 1500 -v --unbinned &&\ + touch metabat2.done &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +##################################### +########### DAS_Tool ################ +##################################### +rule scaffold_list: + input: + sca1=os.path.join(RESULTS_DIR, "Binning/{sample}/metabat_output/"), +# sca2=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/") + file=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/maxbin_output.001.fasta") + threads:config["threads"] +# conda:"/home/users/sbusi/apps/environments/base.yml" + conda: "dastool.yaml" + output: + scout1=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv"), + scout2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv") + shell: + """ + (date &&\ + export Rscript={config[Rscript]} &&\ + Fasta_to_Scaffolds2Bin.sh -i {input.sca1} -e fa > {output.scout1} &&\ + Fasta_to_Scaffolds2Bin.sh -i results/Binning/{wildcards.sample}/maxbin_output -e fasta > {output.scout2} &&\ + touch scaffold_list.done &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule DAS_Tool: + input: + da1=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv"), + da2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv"), + da3=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"), + db=config["dastool_database"] + threads:config["threads"] +# conda:"/home/users/sbusi/apps/environments/base.yml" + conda: "dastool.yaml" + params: + basename=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}") + output: + daout=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")), + dafile=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_proteins.faa"), + damfile=touch(os.path.join(RESULTS_DIR, "Binning/{sample}_das_tool.done")) + shell: + """ + (date &&\ + export Rscript={config[Rscript]} &&\ + DAS_Tool -i {input.da1},{input.da2} -c {input.da3} -o {params.basename} --search_engine diamond -l maxbin2,metabat2 --write_bins 1 --write_bin_evals 1 --threads {threads} --db_directory {input.db} --create_plots 1 &&\ + touch {output.damfile} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ diff --git a/workflows/CONFIG.yaml b/workflows/CONFIG.yaml new file mode 100644 index 0000000..8c9b8f9 --- /dev/null +++ b/workflows/CONFIG.yaml @@ -0,0 +1,148 @@ +data_dir: "data" +results_dir: "results" +db_dir: "dbs" +runs: + first: "20181106_1450_noselection_sizeselection" + second: "20181107_0906_same" + third: "20181108_0827_test" +barcodes: ["barcode06", "barcode07", "barcode08", "barcode09", "barcode10"] +assemblers: ["flye"] +p7zip: + bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za" + threads: 4 +ont_fast5_api: + single_to_multi_fast5: + bin: "single_to_multi_fast5" + batch: 8000 + threads: 8 +flowcell: "FLO-MIN106" +kit: "SQK-LSK108" +#barcodes: ["barcode06", "barcode07", "barcode08", "barcode09", "barcode10"] +barcodes: ["barcode07"] +guppy_cpu: + path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin" + bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller" + version: "cpu-3.1.5" + config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg" + cpu_threads: 28 +guppy_gpu: + path: "/home/users/sbusi/apps/ont-guppy/bin" + bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller" + version: "3.4.5+fb1fbfb" + config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg" + hac_config: "dna_r9.4.1_450bps_hac.cfg" + records_per_fastq: 8000 + chunk_size: 1000 + chunks_per_runner: 1000 + num_callers: 4 + runners_per_device: 2 + gpu_device: "cuda:0" + cpu_threads: 28 +guppy_barcoder: + path: "/home/users/sbusi/apps/ont-guppy/bin" + bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder" + version: "3.4.5+fb1fbfb" + records_per_fastq: 8000 + threads: 8 +nanostats: +#short_reads_prefix: "/scratch/users/claczny/ont/fecal_pilot/data/raw/short_reads" +short_reads_prefix: "/mnt/isilon/projects/lcsb_sequencing/transfer/bioecosystem/Rashi/2019/Apr/fastq" +metaT_prefix: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/metaT/2018_GDB" +#samples: ["Kapa1_MG_S18", "Kapa2_MG_S19", "NEB1_MG_S16", "NEB2_MG_S17", "NEBmod1_MG_Rashi_S22", "NEBmod2_MG_Rashi_S23"] +fastp: + min_length: 40 +minimap2: + threads: 16 +igc: + uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz" +hg38: + uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz" +genomecov: + bin: "bedtools genomecov" +compute_avg_coverage: + bin: "scripts/coverage.awk" +bwa: + threads: 24 + long_reads_index: + opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5" +samtools: + sort: + threads: 4 + chunk_size: "4G" + view: + threads: 4 +flye: + bin: "flye" + threads: 27 + genome_size: "1g" +operams: + bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl" + threads: 28 +megahit: + threads: 28 +nonpareil: + memory: 4096 + threads: 14 +medaka: + threads: 28 +racon: + threads: 28 +rebaler: + threads: 28 +diamond: + threads: 28 + db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd" +metaspades: + threads: 28 +mmseq2: + threads: 24 +# Define sample names +# samples: ["flye", "megahit", "metaspades"] +# samples: ["flye", "megahit"] +samples: ["metaspades_hybrid"] +#binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"] +binning_samples: ["megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"] + +# Hybrid assembler +hybrid_assembler: "metaspades_hybrid" + +# Directory where fastq files are +#data_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning" + +# Directory to save the output to +#results_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning" + +# Number of cpus or threads to use +threads: 28 + +# Path to the the 140GB Kraken2 database +kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/" + +# Path to DAS_Tool +DAS_Tool: + path: "/home/users/sbusi/apps/DAS_Tool-master" + bin: "/home/users/sbusi/apps/DAS_Tool-master/src/" + +# Path to DAS_Tool database +dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/" + +# Mapping options +bwa: + threads: 24 + long_reads_index: + opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5" +samtools: + sort: + threads: 4 + chunk_size: "4G" + view: + threads: 4 +minimap2: + threads: 24 + +# Path to GTDBTK database +GTDBTK: + DATA: "/home/users/sbusi/apps/db/gtdbtk/release89" + +# Rscript path +Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/" diff --git a/workflows/MAPPING_RULES b/workflows/MAPPING_RULES new file mode 100644 index 0000000..7d9c0b3 --- /dev/null +++ b/workflows/MAPPING_RULES @@ -0,0 +1,151 @@ +# Rule for running the mapping "workflow" for ONT analsyes + +#shell.executable("/bin/bash") +#shell.prefix("source ~/.bashrc; ") +import os +from tempfile import TemporaryDirectory + +configfile: "CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa", "mmi"] +SAMPLES=config["samples"] +BINNING_SAMPLES=config["binning_samples"] +HYBRID_ASSEMBLER=config["hybrid_assembler"] + + +####################### +## Mapping to hybrid ## +####################### +rule build_hybrid_bwa_index: + input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER) + output: +# done=touch("bwa_index.done") + index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.amb"), + index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.ann"), + index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwt"), + index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.pac"), + index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.sa") + params: + prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}") + log: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwa_index") + conda: "envs/bwa.yaml" + shell: """ + (date &&\ + bwa index {input} -p {params.prefix} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule hybrid_map_short_reads_to_hybrid_contigs_w_bwa_mem: + input: + query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"), + query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"), + index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"), + index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"), + index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"), + index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"), + index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"), + index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER) + output: + os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}.bam") + params: + index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"), + out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}") + log: os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/sr_contigs.bwa_mem_samtools") + conda: "envs/bwa.yaml" + threads: + config["bwa"]["threads"] +# wildcard_constraints: +# sample='\w+' + shell: """ + (date &&\ + bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule hybrid_map_long_reads_to_hybrid_contigs_w_bwa_mem: + input: + query_lr=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES), + index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"), + index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"), + index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"), + index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"), + index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"), + index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER) + output: + os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}.bam") + params: + index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"), + out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}"), + log: os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/lr_contigs.bwa_mem_samtools") + conda: "envs/bwa.yaml" + threads: config["bwa"]["threads"] +# wildcard_constraints: +# sample='\w+' + shell: """ + (date &&\ + bwa mem -x ont2d -t {threads} {params.index_prefix} {input.query_lr} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule hybrid_build_minimap2_long_read_contigs_ont_index: + input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER) + output: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi") + conda: "envs/minimap2.yaml" + shell: + """ + date + minimap2 -x map-ont -d {output} {input} + date + """ + +rule hybrid_map_long_reads_to_hybrid_contigs_w_minimap2: + input: + query=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode="barcode07"), + index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"), + index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER) + output: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}.bam") + params: + prefix=os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}") + conda: "envs/minimap2.yaml" + threads: config["minimap2"]["threads"] + log: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/metaspades.minimap2_samtools") + shell: """ + (date &&\ + minimap2 -a -t {threads} {input.index} {input.query} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ +rule hybrid_map_short_reads_to_hybrid_contigs_w_minimap2: + input: + query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"), + query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"), + index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"), + index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER) + output: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}.bam") + params: + prefix=os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}") + conda: "envs/minimap2.yaml" + threads: config["minimap2"]["threads"] + log: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/metaspades.minimap2_samtools") + shell: + """ + (date &&\ + minimap2 -a -t {threads} {input.index} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule hybrid_merge_bam_files_for_metaspades: + input: + bam_sr=os.path.join(RESULTS_DIR, "mapping/{mapper}_sr_{hybrid_assembler}/{mapper}_sr_{hybrid_assembler}.bam"), + bam_lr=os.path.join(RESULTS_DIR, "mapping/{mapper}_lr_{hybrid_assembler}/{mapper}_lr_{hybrid_assembler}.bam") + output: os.path.join(RESULTS_DIR, "mapping/{mapper}_merged_{hybrid_assembler}/{mapper}_merged_{hybrid_assembler}.bam") + conda: "/home/users/sbusi/apps/environments/binning.yml" + shell: + """ + (date &&\ + samtools merge {output} {input.bam_sr} {input.bam_lr} &&\ + date) + """ diff --git a/workflows/METAT_RULES b/workflows/METAT_RULES new file mode 100644 index 0000000..bb9f272 --- /dev/null +++ b/workflows/METAT_RULES @@ -0,0 +1,120 @@ +# Rules for mapping the metaT reads to the different assemblies + +import os +from tempfile import TemporaryDirectory + +configfile: "CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa", "mmi"] +SAMPLES=config["samples"] +BINNING_SAMPLES=config["binning_samples"] +HYBRID_ASSEMBLER=config["hybrid_assembler"] + + +########### +## metaT ## +########### +# Preprocess the metaT reads using fastp +rule run_fastp_on_metaT_reads: + input: + r1=os.path.join(config["metaT_prefix"], "{metaT_sample}_R1_001.fastq.gz"), + r2=os.path.join(config["metaT_prefix"], "{metaT_sample}_R2_001.fastq.gz") + output: + r1=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_R1_001.fastp.fq.gz"), + r2=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_R2_001.fastp.fq.gz"), + html=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp.html"), + json=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp.json") + log: os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp") + conda: "envs/fastp.yaml" + shell: + """ + (date &&\ + fastp -l {config[fastp][min_length]} -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} -h {output.html} -j {output.json} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule run_fastqc_on_preprocessed_metaT_reads: + input: os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_{suffix}.fq.gz"), + output: + html=os.path.join(RESULTS_DIR, "qc/metaT/fastqc/{metaT_sample}/{metaT_sample}_{suffix}.fastqc.html"), + zip=os.path.join(RESULTS_DIR, "qc/metaT/fastqc/{metaT_sample}/{metaT_sample}_{suffix}.fastqc.zip") # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename + conda: "envs/fastqc.yaml" + script: "scripts/run_fastqc.py" # Use a separate script rather than a simple call to FASTQC to overcome possible race conditions when running in parallel (https://bitbucket.org/snakemake/snakemake-wrappers/raw/7f5e01706728e32d52aa413f213e950f50bf4129/bio/fastqc/wrapper.py) + + +rule map_metaT_reads_to_long_read_contigs_w_bwa_mem: + input: + query_r1=rules.run_fastp_on_metaT_reads.output.r1, + query_r2=rules.run_fastp_on_metaT_reads.output.r2, + index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.amb"), + index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.ann"), + index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.bwt"), + index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.pac"), + index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.sa"), + index_fa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.fna") + output: os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.bam") + params: + index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly"), + out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x_{barcode}_contigs") + conda: "envs/bwa.yaml" + threads: config["bwa"]["threads"] + log: os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.bwa_mem_samtools") + shell: + """ + (date &&\ + bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + + +rule map_metaT_reads_to_short_read_contigs_w_bwa_mem: + input: + query_r1=rules.run_fastp_on_metaT_reads.output.r1, + query_r2=rules.run_fastp_on_metaT_reads.output.r2, + index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.amb"), + index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.ann"), + index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.bwt"), + index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.pac"), + index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.sa"), + index_fa=os.path.join(RESULTS_DIR, "assembly/megahit/{sr_sample}/final.contigs.fna") + output: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.bam") + params: + index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs"), + out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs") + conda: "envs/bwa.yaml" + threads: config["bwa"]["threads"] + log: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.bwa_mem_samtools") + shell: + """ + (date &&\ + bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + + +rule map_metaT_reads_to_hybrid_contigs_w_bwa_mem: + input: + query_r1=rules.run_fastp_on_metaT_reads.output.r1, + query_r2=rules.run_fastp_on_metaT_reads.output.r2, + index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.amb"), + index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.ann"), + index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.bwt"), + index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.pac"), + index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.sa"), + index_fa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fna") + output: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.bam") + params: + index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs"), + out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs") + log: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.bwa_mem_samtools") + conda: "envs/bwa.yaml" + threads: config["bwa"]["threads"] + shell: + """ + (date &&\ + bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ diff --git a/workflows/MMSEQ_RULES b/workflows/MMSEQ_RULES new file mode 100644 index 0000000..e02e49d --- /dev/null +++ b/workflows/MMSEQ_RULES @@ -0,0 +1,110 @@ +# For running the MMSEQ2 comparison of proteins after assemblies are run through prokka/prodigal + +import os +from tempfile import TemporaryDirectory + +configfile: "CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa", "mm"] +SAMPLES=config["samples"] +BINNING_SAMPLES=config["binning_samples"] +HYBRID_ASSEMBLER=config["hybrid_assembler"] + + +############################# +######## MMSEQ2 ############ +############################# +rule mmseq2_database: + input: + int1=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr/merged/{barcode}/assembly.faa"), assembler="flye", barcode=BARCODES), + int2=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{sample}/final.contigs.faa"), assembler="megahit", sample=SAMPLES), + int3=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr_{barcode}-sr_{sample}/contigs.faa"), assembler="metaspades_hybrid", barcode=BARCODES, sample=SAMPLES) +# expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{prefix}.faa"), assembler=["flye", "megahit", "metaspades_hybrid"]) + output: + dat1=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="flye"), + dat2=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="megahit"), + dat3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid") + log: os.path.join(RESULTS_DIR, "annotation/mmseq2/database.mmseq2.log") + conda: "cd-hit.yaml" + shell: + """ + (date &&\ + mmseqs createdb {input.int1} {output.dat1} &&\ + mmseqs createdb {input.int2} {output.dat2} &&\ + mmseqs createdb {input.int3} {output.dat3} &&\ + date) &> >(tee {log}) + """ + +rule mmseq2_compare: + input: + mm1=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="flye"), + mm2=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="megahit"), + mm3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid") + output: + mo1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh"), + mo2=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh"), + mo3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh") + log: os.path.join(RESULTS_DIR, "annotation/mmseq2/compare.mmseq2.log") + threads: config["mmseq2"]["threads"] + conda: "cd-hit.yaml" + shell: + """ + (date &&\ + mmseqs rbh {input.mm1} {input.mm2} {output.mo1} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\ + mmseqs rbh {input.mm1} {input.mm3} {output.mo2} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\ + mmseqs rbh {input.mm2} {input.mm3} {output.mo3} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\ + date) &> >(tee {log}) + """ + +rule mmseq2_m8_format: + input: + fo1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_db"), + fo2=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_db"), + fo3=os.path.join(RESULTS_DIR, "annotation/mmseq2/metaspades_hybrid_db"), + fo4=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh"), + fo5=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh"), + fo6=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh") + output: + form1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit.m8"), + form2=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid.m8"), + form3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid.m8") + log: os.path.join(RESULTS_DIR, "annotation/mmseq2/convertalis.mmseq2.log") + conda: "cd-hit.yaml" + shell: + """ + (date &&\ + mmseqs convertalis {input.fo1} {input.fo2} {input.fo4} {output.form1} &&\ + mmseqs convertalis {input.fo1} {input.fo3} {input.fo5} {output.form2} &&\ + mmseqs convertalis {input.fo2} {input.fo3} {input.fo6} {output.form3} &&\ + date) &> >(tee {log}) + """ + +rule prepare_plot_files: + output: + touch(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done")) + log: os.path.join(RESULTS_DIR, "annotation/mmseq2/files_ready.mmseq2.log") + shell: + """ + (date &&\ + ./prepare_plot_files.sh &&\ + date) &> >(tee {log}) + """ + +rule mmseq2_plots: + input: + plot1=os.path.join(RESULTS_DIR, "annotation/mmseq2/overlap_sizes.txt"), + plot2=os.path.join(RESULTS_DIR, "annotation/mmseq2/total.txt") + output: + touch(os.path.join(RESULTS_DIR, "annotation/mmseq2/upset_plots.done")) + log: os.path.join(RESULTS_DIR, "annotation/mmseq2/plot.mmseq2.log") + conda: "renv.yaml" + shell: + """ + (date &&\ + Rscript mmseq_plots.R {input.plot1} {input.plot2} &&\ + date) &> >(tee {log}) + """ diff --git a/workflows/TAXONOMY_RULES b/workflows/TAXONOMY_RULES new file mode 100644 index 0000000..e8e1620 --- /dev/null +++ b/workflows/TAXONOMY_RULES @@ -0,0 +1,72 @@ +# For running the contamination check and taxonomy on the generated MAGs + +import os +from tempfile import TemporaryDirectory + +configfile: "CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa", "mm"] +SAMPLES=config["samples"] +BINNING_SAMPLES=config["binning_samples"] +HYBRID_ASSEMBLER=config["hybrid_assembler"] + + +##################################### +######## GTDBTK & CHECKM ############ +##################################### +rule gtdbtk: + input: + gen=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins") + threads:config["threads"] + conda:"gtdbtk.yaml" + params: + outdir=os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}") + output: +# gtdb=directory(os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}")), + gtdbfile=os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}/gtdbtk.bac120.summary.tsv") + wildcard_constraints: + sample='\w+' + shell: + """ + (date &&\ + export GTDBTK_DATA_PATH={config[GTDBTK][DATA]} &&\ + gtdbtk classify_wf --cpus {threads} -x fa --genome_dir {input.gen} --out_dir {params.outdir} &&\ + touch gtdbtk.done &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule checkm: + input: + chkm=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins") + threads:config["threads"] + conda:"checkm.yml" + output: + chkmdir=directory(os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}/lineage.ms")), + chkout=os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}_output.txt") + shell: + """ + (date &&\ + checkm lineage_wf -r -t {threads} -x fa {input.chkm} {output.chkmdir} | tee {output.chkout} &&\ + touch checkm.done &&\ + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ + +rule plot_checkm: + input: + pl1=os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}"), + pl2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins") + threads:config["threads"] + conda:"/home/users/sbusi/apps/environments/checkm.yml" + output: + plt=directory(os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}/plots")) + shell: + """ + date + checkm bin_qa_plot --image_type png --width 8 --dpi 200 --font_size 8 -x fa -q {input.pl1} {input.pl2} {output.plt} + touch plot_checkm.done + date) 2> >(tee {log}.stderr) > >(tee {log}.stdout) + """ diff --git a/workflows/binning b/workflows/binning new file mode 100644 index 0000000..84c222c --- /dev/null +++ b/workflows/binning @@ -0,0 +1,28 @@ +# For running the BINNING "workflow" + +import os +from tempfile import TemporaryDirectory + +configfile: "CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa", "mmi"] +SAMPLES=config["samples"] +BINNING_SAMPLES=config["binning_samples"] +HYBRID_ASSEMBLER=config["hybrid_assembler"] + + +# specify which rules to run +include: + 'BINNING_RULES' + +# Rule all for BINNING +rule BINNING: + input: + expand(os.path.join(RESULTS_DIR, "assembly/{sample}.fa"), sample=BINNING_SAMPLES), + expand(os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_proteins.faa"), sample=BINNING_SAMPLES), + output: + touch('binning_for_ont.done') diff --git a/workflows/mapping b/workflows/mapping new file mode 100644 index 0000000..5937936 --- /dev/null +++ b/workflows/mapping @@ -0,0 +1,29 @@ +# Workflow for running mapping steps of different assemblers, and mappers for the Binning workflow + +import os +from tempfile import TemporaryDirectory + +configfile: "CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa", "mmi"] +SAMPLES=config["samples"] +BINNING_SAMPLES=config["binning_samples"] +HYBRID_ASSEMBLER=config["hybrid_assembler"] + +# specify which rules to run +include: + 'MAPPING_RULES' + +# Rule all for mapping +rule MAPPING: + input: + expand(os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"), hybrid_assembler=HYBRID_ASSEMBLER), + expand(os.path.join(RESULTS_DIR, "mapping/{mapper}_{reads}_{hybrid_assembler}/{mapper}_{reads}_{hybrid_assembler}.bam"), mapper=MAPPERS, reads=["sr", "lr"], hybrid_assembler=HYBRID_ASSEMBLER), + expand(os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"), hybrid_assembler=HYBRID_ASSEMBLER), + expand(os.path.join(RESULTS_DIR, "mapping/{mapper}_merged_{hybrid_assembler}/{mapper}_merged_{hybrid_assembler}.bam"), mapper=MAPPERS, hybrid_assembler=HYBRID_ASSEMBLER) + output: + touch('mapping_for_binning.done') diff --git a/workflows/metat b/workflows/metat new file mode 100644 index 0000000..5daa5e5 --- /dev/null +++ b/workflows/metat @@ -0,0 +1,33 @@ +# Workflow for running mapping metaT reads to different assemblies using different mappers for the Binning workflow + +import os +from tempfile import TemporaryDirectory + +configfile: "CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa", "mmi"] +SAMPLES=config["samples"] +BINNING_SAMPLES=config["binning_samples"] +HYBRID_ASSEMBLER=config["hybrid_assembler"] + +# specify which rules to run +include: + 'METAT_RULES' + +# Rule all for mappinng metaT reads to all assemblies +rule METAT: + input: + expand(os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp.{report_type}"), metaT_sample="GDB_2018_metaT", report_type=["html", "json"]), +# expand(os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}-x-{barcode}.bam"), metaT_sample="GDB_2018_metaT", barcode=BARCODES), + expand(os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.bam"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", assembler="megahit"), + expand(os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.bam"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", barcode=BARCODES, assembler="metaspades_hybrid"), + expand(os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.bam"), metaT_sample="GDB_2018_metaT", barcode=BARCODES, assembler=ASSEMBLERS), + expand(os.path.join(RESULTS_DIR, "genomecov/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.avg_cov.txt"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", assembler="megahit"), + expand(os.path.join(RESULTS_DIR, "genomecov/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.avg_cov.txt"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", barcode=BARCODES, assembler="metaspades_hybrid"), + expand(os.path.join(RESULTS_DIR, "genomecov/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.avg_cov.txt"), metaT_sample="GDB_2018_metaT", barcode=BARCODES, assembler=ASSEMBLERS) + output: + touch('metaT_mapping_for_ONT.done') diff --git a/workflows/mmseq b/workflows/mmseq new file mode 100644 index 0000000..12ec7dd --- /dev/null +++ b/workflows/mmseq @@ -0,0 +1,36 @@ +# For running the MMSEQ2 "workflow" + +import os +from tempfile import TemporaryDirectory + +configfile: "CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa", "mmi"] +SAMPLES=config["samples"] +BINNING_SAMPLES=config["binning_samples"] +HYBRID_ASSEMBLER=config["hybrid_assembler"] + + +# specify which rules to run +include: + 'MMSEQ_RULES' + +# Rule all for running the MMSEQ2 analyses on the proteins after assembly +rule MMSEQ: + input: + expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler=["flye", "megahit", "metaspades_hybrid"]), + expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh")), + expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh")), + expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")), + expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit.m8")), + expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid.m8")), + expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid.m8")), + expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done")), +# expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done")), + expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/upset_plots.done")) + output: + touch('mmseq_comparison_for_ont.done') diff --git a/workflows/taxonomy b/workflows/taxonomy new file mode 100644 index 0000000..bff6dd7 --- /dev/null +++ b/workflows/taxonomy @@ -0,0 +1,28 @@ +# For running the TAXONOMY "workflow" + +import os +from tempfile import TemporaryDirectory + +configfile: "CONFIG.yaml" +DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +ASSEMBLERS=config["assemblers"] +MAPPERS=["bwa", "mmi"] +SAMPLES=config["samples"] +BINNING_SAMPLES=config["binning_samples"] +HYBRID_ASSEMBLER=config["hybrid_assembler"] + + +# specify which rules to run +include: + 'TAXONOMY_RULES' + +# Rule all for TAXONOMY +rule BINNING: + input: + expand(os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}_output.txt"), sample=BINNING_SAMPLES), + expand(os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}/gtdbtk.bac120.summary.tsv"), sample=BINNING_SAMPLES) + output: + touch('taxonomy_for_ont.done') diff --git a/workflows/workflows_snakefile b/workflows/workflows_snakefile new file mode 100644 index 0000000..3891453 --- /dev/null +++ b/workflows/workflows_snakefile @@ -0,0 +1,47 @@ +# include global functions +include: + "workflow/rules/function.definitions.smk" + +# include configuration file +include: + "workflow/rules/ini/config.smk" + +# set working directory and dump output +workdir: + OUTPUTDIR + + +# Single omics MG workflow +elif MG: + if 'preprocessing' in IMP_STEPS: + if len(MG) == 2: + include: + "workflow/rules/modules/single_omics/mg/Preprocessing.smk" + if len(MG) == 1: + include: + "workflow/rules/modules/single_omics/mg/PreprocessingSE.smk" + + if 'assembly' in IMP_STEPS: + include: + "workflow/rules/modules/single_omics/mg/Assembly.smk" + + if 'analysis' in IMP_STEPS: + include: + "workflow/rules/modules/single_omics/mg/Analysis.smk" + + if 'taxonomy' in IMP_STEPS: + include: + "workflow/rules/modules/single_omics/mg/Taxonomy.smk" + + if 'binning' in IMP_STEPS: + include: + "workflow/rules/modules/single_omics/mg/Binning.smk" + +# master command +rule ALL: + input: + inputs_all + output: + touch('status/all.done') + + -- GitLab