From dc8a6b4f63f83862f1ceb16d3c9e4ad1f8fc847d Mon Sep 17 00:00:00 2001
From: "Susheel Bhanu Busi susheel.busi@uni.lu" <susheel.busi@uni.lu>
Date: Sat, 25 Apr 2020 02:03:49 +0200
Subject: [PATCH] created modular parts of the SNAKEFILE for ease of handling
 and downstream processes

---
 workflows/.Snakefile.swp                      | Bin 0 -> 12288 bytes
 .../2020-04-24T185411.179436.snakemake.log    |   3 +
 .../2020-04-24T185419.525822.snakemake.log    |   4 +
 .../2020-04-25T001612.138705.snakemake.log    |   3 +
 .../2020-04-25T001631.788549.snakemake.log    |   3 +
 workflows/BINNING_RULES                       | 125 +++++++++++++++
 workflows/CONFIG.yaml                         | 148 +++++++++++++++++
 workflows/MAPPING_RULES                       | 151 ++++++++++++++++++
 workflows/METAT_RULES                         | 120 ++++++++++++++
 workflows/MMSEQ_RULES                         | 110 +++++++++++++
 workflows/TAXONOMY_RULES                      |  72 +++++++++
 workflows/binning                             |  28 ++++
 workflows/mapping                             |  29 ++++
 workflows/metat                               |  33 ++++
 workflows/mmseq                               |  36 +++++
 workflows/taxonomy                            |  28 ++++
 workflows/workflows_snakefile                 |  47 ++++++
 17 files changed, 940 insertions(+)
 create mode 100644 workflows/.Snakefile.swp
 create mode 100644 workflows/.snakemake/log/2020-04-24T185411.179436.snakemake.log
 create mode 100644 workflows/.snakemake/log/2020-04-24T185419.525822.snakemake.log
 create mode 100644 workflows/.snakemake/log/2020-04-25T001612.138705.snakemake.log
 create mode 100644 workflows/.snakemake/log/2020-04-25T001631.788549.snakemake.log
 create mode 100644 workflows/BINNING_RULES
 create mode 100644 workflows/CONFIG.yaml
 create mode 100644 workflows/MAPPING_RULES
 create mode 100644 workflows/METAT_RULES
 create mode 100644 workflows/MMSEQ_RULES
 create mode 100644 workflows/TAXONOMY_RULES
 create mode 100644 workflows/binning
 create mode 100644 workflows/mapping
 create mode 100644 workflows/metat
 create mode 100644 workflows/mmseq
 create mode 100644 workflows/taxonomy
 create mode 100644 workflows/workflows_snakefile

diff --git a/workflows/.Snakefile.swp b/workflows/.Snakefile.swp
new file mode 100644
index 0000000000000000000000000000000000000000..49aeea5296946efd483307eb0ced11e4b91f6acb
GIT binary patch
literal 12288
zcmeI2&1(}u7>B1Kiq)zpdY&do^<cKCf)J?Gk6Mbv29xMT2%FtWJ9Kt7?CjDO1&{UU
zU*JhR2zt<iH$8Y0!J`GesaHR)et^H->}ndMO{JHXcj3uwW@q-@d1l_tA=4baHeKdt
z3a1&4gYie_*m<8Z(~OMFD%L1%#}UR93o?-Aq@yAeil7i_Sy0hpcQ?z^p{0xy*r79L
zMMeb1N@7?uS8A3c+(0@FUx$sRZG>YhCH1Ve;0N<9<u4eks_l8vl1g;nlg`Ni8R*x*
zQ2o?caWrn7&kgg#hwA-yq;_P043GgbKnBPF86X2>fDCK{17UlB-Nra}q+^*#%YmLU
z{YnKHAOmE843GgbKnBPF86X2>fDDiUGOz^=*e+wAcQZD)2M>?`|0mKhd^*6`d+-=M
z0(ZeOa6l83KoR6Y4h(}`;Mab}R=^AJ5H!FgPy`=`82bQTf+wI2d@uvff}i^s`vKm8
zSKtm<26Ny#I0}w{?|T{h23~`w;2x-g5;z7{1{r$`o`DCz1rE3X#=uvM^$U0do`YN9
zCb$7EgT%!aoJ)FSGC&5%02v?yWMHEPc-N83b{$$lq=d0N-%Uz`9f?Xbd`~*4oVCtN
ztx;7l-h7T_{iVhpNLQ4S3fbZ#%}%QkE;XMnR~psYWTjeSe>%h+`;Az$7k%w}OY4LT
z(`K6z^PINTl96Ven0DH1b7C4Jyrx=joOar5&6peUTM0xv@Z<fgnAhs$>LJHBS~a^#
zMT;w;kC&%T@bPh8>`Z%nAv^k=+Et-99C+;SdTV!&q1TL1Sd!(b63g>yvYL6ano-5G
zMgM;*lX9Ue<tZ1oO)juFk7w$&O1*Y*`YMZ)#FasuKSqQ}&LE9As)Q5z!4kK%%iYLp
zb3Y2(QP@eQ-gki1(sF#=lId^9pSMard%^I|<p`VQrLTEs-+TNhy|-#b+KHPKTx^cl
VGCob;)c?5Ytn!<-;#nWVeglnSbT9w_

literal 0
HcmV?d00001

diff --git a/workflows/.snakemake/log/2020-04-24T185411.179436.snakemake.log b/workflows/.snakemake/log/2020-04-24T185411.179436.snakemake.log
new file mode 100644
index 0000000..6f3df2a
--- /dev/null
+++ b/workflows/.snakemake/log/2020-04-24T185411.179436.snakemake.log
@@ -0,0 +1,3 @@
+WorkflowError in line 6 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/BINNING_SNAKEFILE:
+Config file CONFIG.yaml not found.
+  File "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/BINNING_SNAKEFILE", line 6, in <module>
diff --git a/workflows/.snakemake/log/2020-04-24T185419.525822.snakemake.log b/workflows/.snakemake/log/2020-04-24T185419.525822.snakemake.log
new file mode 100644
index 0000000..a3c2bb8
--- /dev/null
+++ b/workflows/.snakemake/log/2020-04-24T185419.525822.snakemake.log
@@ -0,0 +1,4 @@
+Building DAG of jobs...
+MissingInputException in line 56 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/BINNING_SNAKEFILE:
+Missing input files for rule depth_files:
+results/mapping/flye/flye.bam
diff --git a/workflows/.snakemake/log/2020-04-25T001612.138705.snakemake.log b/workflows/.snakemake/log/2020-04-25T001612.138705.snakemake.log
new file mode 100644
index 0000000..68bdbd5
--- /dev/null
+++ b/workflows/.snakemake/log/2020-04-25T001612.138705.snakemake.log
@@ -0,0 +1,3 @@
+WorkflowError in line 8 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping:
+Config file CONFIG.yaml not found.
+  File "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping", line 8, in <module>
diff --git a/workflows/.snakemake/log/2020-04-25T001631.788549.snakemake.log b/workflows/.snakemake/log/2020-04-25T001631.788549.snakemake.log
new file mode 100644
index 0000000..98e738f
--- /dev/null
+++ b/workflows/.snakemake/log/2020-04-25T001631.788549.snakemake.log
@@ -0,0 +1,3 @@
+CreateRuleException in line 23 of /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping:
+The name MAPPING is already used by another rule
+  File "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/workflows/mapping", line 23, in <module>
diff --git a/workflows/BINNING_RULES b/workflows/BINNING_RULES
new file mode 100644
index 0000000..3bb7ad5
--- /dev/null
+++ b/workflows/BINNING_RULES
@@ -0,0 +1,125 @@
+# Rules for BINNING workflow, i.e. generating MAGs from assemblies
+
+import os
+from tempfile import TemporaryDirectory
+
+configfile: "CONFIG.yaml"
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR=config["db_dir"]
+BARCODES=config["barcodes"]
+ASSEMBLERS=config["assemblers"]
+MAPPERS=["bwa", "mm"]
+SAMPLES=config["samples"]
+BINNING_SAMPLES=config["binning_samples"]
+HYBRID_ASSEMBLER=config["hybrid_assembler"]
+
+
+#####################################
+########### MAXBIN2 #################
+#####################################
+rule maxbin2:
+    input:
+          bbref=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"),
+          bbam=os.path.join(RESULTS_DIR, "mapping/{sample}/{sample}.bam")
+    threads:config["threads"]
+    conda:"/home/users/sbusi/apps/environments/binning.yml"
+    output:
+           map=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}.sam"),
+           coverage=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_cov.txt"),
+           abund=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_abundance.txt"),
+           file=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/maxbin_output.001.fasta")
+#        mx=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/"))
+    shell:
+           """
+            (date &&\
+            samtools view -h -o {output.map} {input.bbam} &&\
+            pileup.sh in={output.map} out={output.coverage} &&\
+            awk '{{print $1"\\t"$2}}' {output.coverage} | grep -v '^#' > {output.abund} &&\
+            run_MaxBin.pl -thread {threads} -contig {input.bbref} -out results/Binning/{wildcards.sample}/maxbin_output/maxbin_output -abund {output.abund} &&\
+            touch maxbin.done &&\
+            date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+            """
+
+#####################################
+########### METABAT2 ################
+#####################################
+
+rule depth_files:
+    input:
+          dep1=os.path.join(RESULTS_DIR, "mapping/{sample}/{sample}.bam")
+    threads:config["threads"]
+    conda:"/home/users/sbusi/apps/environments/binning.yml"
+    output:
+          depout=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_depth.txt")
+    shell:
+           """
+            (date &&\
+            jgi_summarize_bam_contig_depths --outputDepth {output.depout} {input.dep1} &&\
+            touch depth_file.done &&\
+            date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+            """
+
+rule metabat2:
+    input:
+           met1=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"),
+           met2=os.path.join(RESULTS_DIR, "Binning/{sample}/{sample}_depth.txt")
+    threads:config["threads"]
+    conda:"/home/users/sbusi/apps/environments/binning.yml"
+    output:
+           outdir=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/metabat_output/"))
+    shell:
+           """
+            (date &&\
+            metabat2 -i {input.met1} -a {input.met2} -o results/Binning/{wildcards.sample}/metabat_output/{wildcards.sample}.metabat -t {threads} -m 1500 -v --unbinned &&\
+            touch metabat2.done &&\
+            date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+            """
+
+#####################################
+########### DAS_Tool ################
+#####################################
+rule scaffold_list:
+    input:
+        sca1=os.path.join(RESULTS_DIR, "Binning/{sample}/metabat_output/"),
+#       sca2=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/")
+        file=os.path.join(RESULTS_DIR, "Binning/{sample}/maxbin_output/maxbin_output.001.fasta")
+    threads:config["threads"]
+#    conda:"/home/users/sbusi/apps/environments/base.yml"
+    conda: "dastool.yaml"
+    output:
+            scout1=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv"),
+            scout2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv")
+    shell:
+            """
+            (date &&\
+            export Rscript={config[Rscript]} &&\
+            Fasta_to_Scaffolds2Bin.sh -i {input.sca1} -e fa > {output.scout1} &&\
+            Fasta_to_Scaffolds2Bin.sh -i results/Binning/{wildcards.sample}/maxbin_output -e fasta > {output.scout2} &&\
+            touch scaffold_list.done &&\
+            date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+            """
+
+rule DAS_Tool:
+    input:
+            da1=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_metabat.scaffolds2bin.tsv"),
+            da2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_maxbin.scaffolds2bin.tsv"),
+            da3=os.path.join(RESULTS_DIR, "assembly/{sample}.fa"),
+            db=config["dastool_database"]
+    threads:config["threads"]
+#    conda:"/home/users/sbusi/apps/environments/base.yml"
+    conda: "dastool.yaml"
+    params:
+            basename=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}")
+    output:
+            daout=directory(os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")),
+            dafile=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_proteins.faa"),
+            damfile=touch(os.path.join(RESULTS_DIR, "Binning/{sample}_das_tool.done"))
+    shell:
+            """
+            (date &&\
+            export Rscript={config[Rscript]} &&\
+            DAS_Tool -i {input.da1},{input.da2} -c {input.da3} -o {params.basename} --search_engine diamond -l maxbin2,metabat2 --write_bins 1 --write_bin_evals 1 --threads {threads} --db_directory {input.db} --create_plots 1 &&\
+            touch {output.damfile} &&\
+            date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+            """
diff --git a/workflows/CONFIG.yaml b/workflows/CONFIG.yaml
new file mode 100644
index 0000000..8c9b8f9
--- /dev/null
+++ b/workflows/CONFIG.yaml
@@ -0,0 +1,148 @@
+data_dir: "data"
+results_dir: "results"
+db_dir: "dbs"
+runs:
+    first: "20181106_1450_noselection_sizeselection"
+    second: "20181107_0906_same"
+    third: "20181108_0827_test"
+barcodes: ["barcode06", "barcode07", "barcode08", "barcode09", "barcode10"]
+assemblers: ["flye"]
+p7zip:
+    bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
+    threads: 4
+ont_fast5_api:
+    single_to_multi_fast5:
+        bin: "single_to_multi_fast5"
+        batch: 8000
+        threads: 8
+flowcell: "FLO-MIN106"
+kit: "SQK-LSK108"
+#barcodes: ["barcode06", "barcode07", "barcode08", "barcode09", "barcode10"]
+barcodes: ["barcode07"]
+guppy_cpu:
+    path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
+    bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
+    version: "cpu-3.1.5"
+    config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
+    cpu_threads: 28
+guppy_gpu:
+    path: "/home/users/sbusi/apps/ont-guppy/bin"
+    bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller"
+    version: "3.4.5+fb1fbfb"
+    config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
+    hac_config: "dna_r9.4.1_450bps_hac.cfg"
+    records_per_fastq: 8000
+    chunk_size: 1000
+    chunks_per_runner: 1000
+    num_callers: 4
+    runners_per_device: 2
+    gpu_device: "cuda:0"
+    cpu_threads: 28
+guppy_barcoder:
+    path: "/home/users/sbusi/apps/ont-guppy/bin"
+    bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder"
+    version: "3.4.5+fb1fbfb"
+    records_per_fastq: 8000
+    threads: 8
+nanostats:
+#short_reads_prefix: "/scratch/users/claczny/ont/fecal_pilot/data/raw/short_reads"
+short_reads_prefix: "/mnt/isilon/projects/lcsb_sequencing/transfer/bioecosystem/Rashi/2019/Apr/fastq"
+metaT_prefix: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/metaT/2018_GDB"
+#samples: ["Kapa1_MG_S18", "Kapa2_MG_S19", "NEB1_MG_S16", "NEB2_MG_S17", "NEBmod1_MG_Rashi_S22", "NEBmod2_MG_Rashi_S23"]
+fastp:
+    min_length: 40
+minimap2:
+    threads: 16
+igc:
+    uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
+hg38:
+    uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
+genomecov:
+    bin: "bedtools genomecov"
+compute_avg_coverage:
+    bin: "scripts/coverage.awk"
+bwa:
+    threads: 24
+    long_reads_index:
+        opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
+samtools:
+    sort:
+        threads: 4
+        chunk_size: "4G"
+    view:
+        threads: 4
+flye:
+    bin: "flye"
+    threads: 27
+    genome_size: "1g"
+operams:
+    bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
+    threads: 28
+megahit:
+    threads: 28
+nonpareil:
+    memory: 4096
+    threads: 14
+medaka:
+    threads: 28
+racon:
+    threads: 28 
+rebaler:
+    threads: 28
+diamond:
+    threads: 28
+    db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
+metaspades:
+    threads: 28
+mmseq2:
+    threads: 24
+# Define sample names
+# samples: ["flye", "megahit", "metaspades"]
+# samples: ["flye", "megahit"]
+samples: ["metaspades_hybrid"]
+#binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
+binning_samples: ["megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
+
+# Hybrid assembler
+hybrid_assembler: "metaspades_hybrid"
+
+# Directory where fastq files are
+#data_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning"
+
+# Directory to save the output to
+#results_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning"
+
+# Number of cpus or threads to use
+threads: 28
+
+# Path to the the 140GB Kraken2 database
+kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
+
+# Path to DAS_Tool
+DAS_Tool:
+        path: "/home/users/sbusi/apps/DAS_Tool-master"
+        bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
+
+# Path to DAS_Tool database
+dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
+
+# Mapping options
+bwa:
+    threads: 24
+    long_reads_index:
+        opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
+samtools:
+    sort:
+        threads: 4
+        chunk_size: "4G"
+    view:
+        threads: 4
+minimap2:
+    threads: 24
+
+# Path to GTDBTK database
+GTDBTK:
+    DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
+
+# Rscript path
+Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
diff --git a/workflows/MAPPING_RULES b/workflows/MAPPING_RULES
new file mode 100644
index 0000000..7d9c0b3
--- /dev/null
+++ b/workflows/MAPPING_RULES
@@ -0,0 +1,151 @@
+# Rule for running the mapping "workflow" for ONT analsyes
+
+#shell.executable("/bin/bash")
+#shell.prefix("source ~/.bashrc; ")
+import os
+from tempfile import TemporaryDirectory
+
+configfile: "CONFIG.yaml"
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR=config["db_dir"]
+BARCODES=config["barcodes"]
+ASSEMBLERS=config["assemblers"]
+MAPPERS=["bwa", "mmi"]
+SAMPLES=config["samples"]
+BINNING_SAMPLES=config["binning_samples"]
+HYBRID_ASSEMBLER=config["hybrid_assembler"]
+
+
+#######################
+## Mapping to hybrid ##
+#######################
+rule build_hybrid_bwa_index:
+    input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
+    output:
+#           done=touch("bwa_index.done")
+           index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.amb"),
+           index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.ann"),
+           index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwt"),
+           index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.pac"),
+           index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.sa")
+    params:
+           prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}")
+    log: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{prefix}.bwa_index")
+    conda: "envs/bwa.yaml"
+    shell: """
+           (date &&\
+           bwa index {input} -p {params.prefix} &&\
+           date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+           """
+
+rule hybrid_map_short_reads_to_hybrid_contigs_w_bwa_mem:
+    input:
+          query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"),
+          query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"),
+          index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"),
+          index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"),
+          index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"),
+          index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"),
+          index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"),
+          index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
+    output:
+          os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}.bam")
+    params:
+          index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"),
+          out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/bwa_sr_{hybrid_assembler}")
+    log: os.path.join(RESULTS_DIR, "mapping/bwa_sr_{hybrid_assembler}/sr_contigs.bwa_mem_samtools")
+    conda: "envs/bwa.yaml"
+    threads:
+          config["bwa"]["threads"]
+#    wildcard_constraints:
+#          sample='\w+'
+    shell: """
+           (date &&\
+           bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
+           date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+           """
+
+rule hybrid_map_long_reads_to_hybrid_contigs_w_bwa_mem:
+    input:
+          query_lr=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode=BARCODES),
+          index_amb=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.amb"),
+          index_ann=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.ann"),
+          index_bwt=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"),
+          index_pac=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.pac"),
+          index_sa=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.sa"),
+          index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
+    output:
+          os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}.bam")
+    params:
+          index_prefix=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades"),
+          out_prefix=os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/bwa_lr_{hybrid_assembler}"),
+    log:  os.path.join(RESULTS_DIR, "mapping/bwa_lr_{hybrid_assembler}/lr_contigs.bwa_mem_samtools")
+    conda: "envs/bwa.yaml"
+    threads: config["bwa"]["threads"]
+#    wildcard_constraints:
+#          sample='\w+'
+    shell: """
+          (date &&\
+          bwa mem -x ont2d -t {threads} {params.index_prefix} {input.query_lr} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
+          date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+          """
+
+rule hybrid_build_minimap2_long_read_contigs_ont_index:
+    input: expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
+    output: os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi")
+    conda: "envs/minimap2.yaml"
+    shell:
+           """
+            date
+            minimap2 -x map-ont -d {output} {input}
+            date
+            """
+
+rule hybrid_map_long_reads_to_hybrid_contigs_w_minimap2:
+    input:
+          query=expand(os.path.join(RESULTS_DIR, "basecalled/merged/{barcode}.fastq.gz"), barcode="barcode07"),
+          index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"),
+          index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
+    output: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}.bam")
+    params:
+          prefix=os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/mmi_lr_{hybrid_assembler}")
+    conda: "envs/minimap2.yaml"
+    threads: config["minimap2"]["threads"]
+    log: os.path.join(RESULTS_DIR, "mapping/mmi_lr_{hybrid_assembler}/metaspades.minimap2_samtools")
+    shell: """
+            (date &&\
+            minimap2 -a -t {threads} {input.index} {input.query} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output} &&\
+            date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+            """
+rule hybrid_map_short_reads_to_hybrid_contigs_w_minimap2:
+    input:
+          query_r1=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"),
+          query_r2=expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R2_001.fastp.fq.gz"), sr_sample="NEB2_MG_S17"),
+          index=os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"),
+          index_fa=expand(os.path.join(RESULTS_DIR, "assembly/{hybrid_assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fasta"), sr_sample="NEB2_MG_S17", barcode="barcode07", hybrid_assembler=HYBRID_ASSEMBLER)
+    output: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}.bam")
+    params:
+          prefix=os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/mmi_sr_{hybrid_assembler}")
+    conda: "envs/minimap2.yaml"
+    threads: config["minimap2"]["threads"]
+    log: os.path.join(RESULTS_DIR, "mapping/mmi_sr_{hybrid_assembler}/metaspades.minimap2_samtools")
+    shell:
+           """
+            (date &&\
+           minimap2 -a -t {threads} {input.index} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.prefix} > {output}
+           date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+            """
+
+rule hybrid_merge_bam_files_for_metaspades:
+    input:
+          bam_sr=os.path.join(RESULTS_DIR, "mapping/{mapper}_sr_{hybrid_assembler}/{mapper}_sr_{hybrid_assembler}.bam"),
+          bam_lr=os.path.join(RESULTS_DIR, "mapping/{mapper}_lr_{hybrid_assembler}/{mapper}_lr_{hybrid_assembler}.bam")
+    output: os.path.join(RESULTS_DIR, "mapping/{mapper}_merged_{hybrid_assembler}/{mapper}_merged_{hybrid_assembler}.bam")
+    conda: "/home/users/sbusi/apps/environments/binning.yml"
+    shell:
+          """
+          (date &&\
+          samtools merge {output} {input.bam_sr} {input.bam_lr} &&\
+          date)
+          """
diff --git a/workflows/METAT_RULES b/workflows/METAT_RULES
new file mode 100644
index 0000000..bb9f272
--- /dev/null
+++ b/workflows/METAT_RULES
@@ -0,0 +1,120 @@
+# Rules for mapping the metaT reads to the different assemblies
+
+import os
+from tempfile import TemporaryDirectory
+
+configfile: "CONFIG.yaml"
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR=config["db_dir"]
+ASSEMBLERS=config["assemblers"]
+MAPPERS=["bwa", "mmi"]
+SAMPLES=config["samples"]
+BINNING_SAMPLES=config["binning_samples"]
+HYBRID_ASSEMBLER=config["hybrid_assembler"]
+
+
+###########
+## metaT ##
+###########
+# Preprocess the metaT reads using fastp
+rule run_fastp_on_metaT_reads:
+    input:
+        r1=os.path.join(config["metaT_prefix"], "{metaT_sample}_R1_001.fastq.gz"),
+        r2=os.path.join(config["metaT_prefix"], "{metaT_sample}_R2_001.fastq.gz")
+    output:
+        r1=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_R1_001.fastp.fq.gz"),
+        r2=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_R2_001.fastp.fq.gz"),
+        html=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp.html"),
+        json=os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp.json")
+    log: os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp")
+    conda: "envs/fastp.yaml"
+    shell:
+        """
+        (date &&\
+        fastp -l {config[fastp][min_length]} -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} -h {output.html} -j {output.json} &&\
+        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+        """
+
+rule run_fastqc_on_preprocessed_metaT_reads:
+    input: os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}_{suffix}.fq.gz"),
+    output:
+        html=os.path.join(RESULTS_DIR, "qc/metaT/fastqc/{metaT_sample}/{metaT_sample}_{suffix}.fastqc.html"),
+        zip=os.path.join(RESULTS_DIR, "qc/metaT/fastqc/{metaT_sample}/{metaT_sample}_{suffix}.fastqc.zip") # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename
+    conda: "envs/fastqc.yaml"
+    script: "scripts/run_fastqc.py" # Use a separate script rather than a simple call to FASTQC to overcome possible race conditions when running in parallel (https://bitbucket.org/snakemake/snakemake-wrappers/raw/7f5e01706728e32d52aa413f213e950f50bf4129/bio/fastqc/wrapper.py)
+
+
+rule map_metaT_reads_to_long_read_contigs_w_bwa_mem:
+    input:
+        query_r1=rules.run_fastp_on_metaT_reads.output.r1,
+        query_r2=rules.run_fastp_on_metaT_reads.output.r2,
+        index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.amb"),
+        index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.ann"),
+        index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.bwt"),
+        index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.pac"),
+        index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.sa"),
+        index_fa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly.fna")
+    output: os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.bam")
+    params:
+        index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr/merged/{barcode}/assembly"),
+        out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x_{barcode}_contigs")
+    conda: "envs/bwa.yaml"
+    threads: config["bwa"]["threads"]
+    log: os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.bwa_mem_samtools")
+    shell:
+        """
+        (date &&\
+        bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
+        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+        """
+
+
+rule map_metaT_reads_to_short_read_contigs_w_bwa_mem:
+    input:
+        query_r1=rules.run_fastp_on_metaT_reads.output.r1,
+        query_r2=rules.run_fastp_on_metaT_reads.output.r2,
+        index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.amb"),
+        index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.ann"),
+        index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.bwt"),
+        index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.pac"),
+        index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs.sa"),
+        index_fa=os.path.join(RESULTS_DIR, "assembly/megahit/{sr_sample}/final.contigs.fna")
+    output: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.bam")
+    params:
+        index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/{sr_sample}/final.contigs"),
+        out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs")
+    conda: "envs/bwa.yaml"
+    threads: config["bwa"]["threads"]
+    log: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.bwa_mem_samtools")
+    shell:
+        """
+        (date &&\
+        bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
+        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+        """
+
+
+rule map_metaT_reads_to_hybrid_contigs_w_bwa_mem:
+    input:
+        query_r1=rules.run_fastp_on_metaT_reads.output.r1,
+        query_r2=rules.run_fastp_on_metaT_reads.output.r2,
+        index_amb=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.amb"),
+        index_ann=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.ann"),
+        index_bwt=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.bwt"),
+        index_pac=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.pac"),
+        index_sa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.sa"),
+        index_fa=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.fna")
+    output: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.bam")
+    params:
+        index_prefix=os.path.join(RESULTS_DIR, "assembly/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs"),
+        out_prefix=os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs")
+    log: os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.bwa_mem_samtools")
+    conda: "envs/bwa.yaml"
+    threads: config["bwa"]["threads"]
+    shell:
+        """
+        (date &&\
+        bwa mem -t {threads} {params.index_prefix} {input.query_r1} {input.query_r2} | samtools view -@ {config[samtools][view][threads]} -SbT {input.index_fa} | samtools sort -@ {config[samtools][sort][threads]} -m {config[samtools][sort][chunk_size]} -T {params.out_prefix} > {output} &&\
+        date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+        """
diff --git a/workflows/MMSEQ_RULES b/workflows/MMSEQ_RULES
new file mode 100644
index 0000000..e02e49d
--- /dev/null
+++ b/workflows/MMSEQ_RULES
@@ -0,0 +1,110 @@
+# For running the MMSEQ2 comparison of proteins after assemblies are run through prokka/prodigal
+
+import os
+from tempfile import TemporaryDirectory
+
+configfile: "CONFIG.yaml"
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR=config["db_dir"]
+BARCODES=config["barcodes"]
+ASSEMBLERS=config["assemblers"]
+MAPPERS=["bwa", "mm"]
+SAMPLES=config["samples"]
+BINNING_SAMPLES=config["binning_samples"]
+HYBRID_ASSEMBLER=config["hybrid_assembler"]
+
+
+#############################
+######## MMSEQ2  ############
+#############################
+rule mmseq2_database:
+    input:
+        int1=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr/merged/{barcode}/assembly.faa"), assembler="flye", barcode=BARCODES),
+        int2=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{sample}/final.contigs.faa"), assembler="megahit", sample=SAMPLES),
+        int3=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr_{barcode}-sr_{sample}/contigs.faa"), assembler="metaspades_hybrid", barcode=BARCODES, sample=SAMPLES)
+#        expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{prefix}.faa"), assembler=["flye", "megahit", "metaspades_hybrid"])
+    output:
+        dat1=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="flye"),
+        dat2=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="megahit"),
+        dat3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid")
+    log: os.path.join(RESULTS_DIR, "annotation/mmseq2/database.mmseq2.log")
+    conda: "cd-hit.yaml"
+    shell:
+        """
+        (date &&\
+        mmseqs createdb {input.int1} {output.dat1} &&\
+        mmseqs createdb {input.int2} {output.dat2} &&\
+        mmseqs createdb {input.int3} {output.dat3} &&\
+        date) &> >(tee {log})
+        """
+
+rule mmseq2_compare:
+    input:
+        mm1=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="flye"),
+        mm2=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="megahit"),
+        mm3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid")
+    output:
+        mo1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh"),
+        mo2=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh"),
+        mo3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")
+    log: os.path.join(RESULTS_DIR, "annotation/mmseq2/compare.mmseq2.log")
+    threads: config["mmseq2"]["threads"]
+    conda: "cd-hit.yaml"
+    shell:
+        """
+        (date &&\
+        mmseqs rbh {input.mm1} {input.mm2} {output.mo1} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
+        mmseqs rbh {input.mm1} {input.mm3} {output.mo2} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
+        mmseqs rbh {input.mm2} {input.mm3} {output.mo3} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
+        date) &> >(tee {log})
+        """
+
+rule mmseq2_m8_format:
+    input:
+        fo1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_db"),
+        fo2=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_db"),
+        fo3=os.path.join(RESULTS_DIR, "annotation/mmseq2/metaspades_hybrid_db"),
+        fo4=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh"),
+        fo5=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh"),
+        fo6=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")
+    output:
+        form1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit.m8"),
+        form2=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid.m8"),
+        form3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid.m8")
+    log: os.path.join(RESULTS_DIR, "annotation/mmseq2/convertalis.mmseq2.log")
+    conda: "cd-hit.yaml"
+    shell:
+        """
+        (date &&\
+        mmseqs convertalis {input.fo1} {input.fo2} {input.fo4} {output.form1}  &&\
+        mmseqs convertalis {input.fo1} {input.fo3} {input.fo5} {output.form2} &&\
+        mmseqs convertalis {input.fo2} {input.fo3} {input.fo6} {output.form3} &&\
+        date) &> >(tee {log})
+        """
+
+rule prepare_plot_files:
+    output:
+        touch(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done"))
+    log: os.path.join(RESULTS_DIR, "annotation/mmseq2/files_ready.mmseq2.log")
+    shell:
+        """
+        (date &&\
+        ./prepare_plot_files.sh &&\
+        date) &> >(tee {log})
+        """
+
+rule mmseq2_plots:
+    input:
+        plot1=os.path.join(RESULTS_DIR, "annotation/mmseq2/overlap_sizes.txt"),
+        plot2=os.path.join(RESULTS_DIR, "annotation/mmseq2/total.txt")
+    output:
+        touch(os.path.join(RESULTS_DIR, "annotation/mmseq2/upset_plots.done"))
+    log: os.path.join(RESULTS_DIR, "annotation/mmseq2/plot.mmseq2.log")
+    conda: "renv.yaml"
+    shell:
+        """
+        (date &&\
+        Rscript mmseq_plots.R {input.plot1} {input.plot2} &&\
+        date) &> >(tee {log})
+        """
diff --git a/workflows/TAXONOMY_RULES b/workflows/TAXONOMY_RULES
new file mode 100644
index 0000000..e8e1620
--- /dev/null
+++ b/workflows/TAXONOMY_RULES
@@ -0,0 +1,72 @@
+# For running the contamination check and taxonomy on the generated MAGs
+
+import os
+from tempfile import TemporaryDirectory
+
+configfile: "CONFIG.yaml"
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR=config["db_dir"]
+BARCODES=config["barcodes"]
+ASSEMBLERS=config["assemblers"]
+MAPPERS=["bwa", "mm"]
+SAMPLES=config["samples"]
+BINNING_SAMPLES=config["binning_samples"]
+HYBRID_ASSEMBLER=config["hybrid_assembler"]
+
+
+#####################################
+######## GTDBTK & CHECKM ############
+#####################################
+rule gtdbtk:
+    input:
+           gen=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")
+    threads:config["threads"]
+    conda:"gtdbtk.yaml"
+    params:
+           outdir=os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}")
+    output:
+#           gtdb=directory(os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}")),
+           gtdbfile=os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}/gtdbtk.bac120.summary.tsv")
+    wildcard_constraints:
+           sample='\w+'
+    shell:
+            """
+            (date &&\
+            export GTDBTK_DATA_PATH={config[GTDBTK][DATA]} &&\
+            gtdbtk classify_wf --cpus {threads} -x fa --genome_dir {input.gen} --out_dir {params.outdir} &&\
+            touch gtdbtk.done &&\
+            date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+            """
+
+rule checkm:
+    input:
+           chkm=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")
+    threads:config["threads"]
+    conda:"checkm.yml"
+    output:
+           chkmdir=directory(os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}/lineage.ms")),
+           chkout=os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}_output.txt")
+    shell:
+            """
+            (date &&\
+            checkm lineage_wf -r -t {threads} -x fa {input.chkm} {output.chkmdir}  | tee {output.chkout} &&\
+            touch checkm.done &&\
+            date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+            """
+
+rule plot_checkm:
+    input:
+           pl1=os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}"),
+           pl2=os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_DASTool_bins")
+    threads:config["threads"]
+    conda:"/home/users/sbusi/apps/environments/checkm.yml"
+    output:
+           plt=directory(os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}/plots"))
+    shell:
+         """
+         date
+         checkm bin_qa_plot --image_type png --width 8 --dpi 200 --font_size 8 -x fa -q {input.pl1} {input.pl2} {output.plt}
+         touch plot_checkm.done
+         date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
+         """
diff --git a/workflows/binning b/workflows/binning
new file mode 100644
index 0000000..84c222c
--- /dev/null
+++ b/workflows/binning
@@ -0,0 +1,28 @@
+# For running the BINNING "workflow"
+
+import os
+from tempfile import TemporaryDirectory
+
+configfile: "CONFIG.yaml"
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR=config["db_dir"]
+BARCODES=config["barcodes"]
+ASSEMBLERS=config["assemblers"]
+MAPPERS=["bwa", "mmi"]
+SAMPLES=config["samples"]
+BINNING_SAMPLES=config["binning_samples"]
+HYBRID_ASSEMBLER=config["hybrid_assembler"]
+
+
+# specify which rules to run
+include:
+    'BINNING_RULES'
+
+# Rule all for BINNING
+rule BINNING:    
+    input:
+        expand(os.path.join(RESULTS_DIR, "assembly/{sample}.fa"), sample=BINNING_SAMPLES),
+        expand(os.path.join(RESULTS_DIR, "Binning/{sample}/dastool_output/{sample}_proteins.faa"), sample=BINNING_SAMPLES),
+    output:
+        touch('binning_for_ont.done')
diff --git a/workflows/mapping b/workflows/mapping
new file mode 100644
index 0000000..5937936
--- /dev/null
+++ b/workflows/mapping
@@ -0,0 +1,29 @@
+# Workflow for running mapping steps of different assemblers, and mappers for the Binning workflow
+
+import os
+from tempfile import TemporaryDirectory
+
+configfile: "CONFIG.yaml"
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR=config["db_dir"]
+BARCODES=config["barcodes"]
+ASSEMBLERS=config["assemblers"]
+MAPPERS=["bwa", "mmi"]
+SAMPLES=config["samples"]
+BINNING_SAMPLES=config["binning_samples"]
+HYBRID_ASSEMBLER=config["hybrid_assembler"]
+
+# specify which rules to run
+include:
+    'MAPPING_RULES'
+
+# Rule all for mapping
+rule MAPPING:
+    input:
+        expand(os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/metaspades.bwt"), hybrid_assembler=HYBRID_ASSEMBLER),
+        expand(os.path.join(RESULTS_DIR, "mapping/{mapper}_{reads}_{hybrid_assembler}/{mapper}_{reads}_{hybrid_assembler}.bam"), mapper=MAPPERS, reads=["sr", "lr"], hybrid_assembler=HYBRID_ASSEMBLER),
+        expand(os.path.join(RESULTS_DIR, "mapping/{hybrid_assembler}/{hybrid_assembler}.mmi"), hybrid_assembler=HYBRID_ASSEMBLER),
+        expand(os.path.join(RESULTS_DIR, "mapping/{mapper}_merged_{hybrid_assembler}/{mapper}_merged_{hybrid_assembler}.bam"), mapper=MAPPERS, hybrid_assembler=HYBRID_ASSEMBLER)
+    output:
+        touch('mapping_for_binning.done')
diff --git a/workflows/metat b/workflows/metat
new file mode 100644
index 0000000..5daa5e5
--- /dev/null
+++ b/workflows/metat
@@ -0,0 +1,33 @@
+# Workflow for running mapping metaT reads to different assemblies using different mappers for the Binning workflow
+
+import os
+from tempfile import TemporaryDirectory
+
+configfile: "CONFIG.yaml"
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR=config["db_dir"]
+BARCODES=config["barcodes"]
+ASSEMBLERS=config["assemblers"]
+MAPPERS=["bwa", "mmi"]
+SAMPLES=config["samples"]
+BINNING_SAMPLES=config["binning_samples"]
+HYBRID_ASSEMBLER=config["hybrid_assembler"]
+
+# specify which rules to run
+include:
+    'METAT_RULES'
+
+# Rule all for mappinng metaT reads to all assemblies
+rule METAT:
+    input:
+        expand(os.path.join(RESULTS_DIR, "preprocessing/metaT/{metaT_sample}.fastp.{report_type}"), metaT_sample="GDB_2018_metaT", report_type=["html", "json"]),
+#       expand(os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}-x-{barcode}.bam"), metaT_sample="GDB_2018_metaT", barcode=BARCODES),
+        expand(os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.bam"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", assembler="megahit"),
+        expand(os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.bam"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", barcode=BARCODES, assembler="metaspades_hybrid"),
+        expand(os.path.join(RESULTS_DIR, "mapping/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.bam"), metaT_sample="GDB_2018_metaT", barcode=BARCODES, assembler=ASSEMBLERS),
+        expand(os.path.join(RESULTS_DIR, "genomecov/metaT/sr/{metaT_sample}_reads-x-{sr_sample}-{assembler}_contigs.avg_cov.txt"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", assembler="megahit"),
+        expand(os.path.join(RESULTS_DIR, "genomecov/metaT/sr/{metaT_sample}_reads-x-lr_{barcode}_sr_{sr_sample}-{assembler}_contigs.avg_cov.txt"), metaT_sample="GDB_2018_metaT", sr_sample="NEB2_MG_S17", barcode=BARCODES, assembler="metaspades_hybrid"),
+        expand(os.path.join(RESULTS_DIR, "genomecov/metaT/lr/{metaT_sample}_reads-x-{barcode}-{assembler}_contigs.avg_cov.txt"), metaT_sample="GDB_2018_metaT", barcode=BARCODES, assembler=ASSEMBLERS)
+    output:
+        touch('metaT_mapping_for_ONT.done')
diff --git a/workflows/mmseq b/workflows/mmseq
new file mode 100644
index 0000000..12ec7dd
--- /dev/null
+++ b/workflows/mmseq
@@ -0,0 +1,36 @@
+# For running the MMSEQ2 "workflow"
+
+import os
+from tempfile import TemporaryDirectory
+
+configfile: "CONFIG.yaml"
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR=config["db_dir"]
+BARCODES=config["barcodes"]
+ASSEMBLERS=config["assemblers"]
+MAPPERS=["bwa", "mmi"]
+SAMPLES=config["samples"]
+BINNING_SAMPLES=config["binning_samples"]
+HYBRID_ASSEMBLER=config["hybrid_assembler"]
+
+
+# specify which rules to run
+include:
+    'MMSEQ_RULES'
+
+# Rule all for running the MMSEQ2 analyses on the proteins after assembly
+rule MMSEQ:    
+    input:
+        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler=["flye", "megahit", "metaspades_hybrid"]),
+        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh")),
+        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh")),
+        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")),
+        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit.m8")),
+        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid.m8")),
+        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid.m8")),
+        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done")),
+#        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done")),
+        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/upset_plots.done"))
+    output:
+        touch('mmseq_comparison_for_ont.done')
diff --git a/workflows/taxonomy b/workflows/taxonomy
new file mode 100644
index 0000000..bff6dd7
--- /dev/null
+++ b/workflows/taxonomy
@@ -0,0 +1,28 @@
+# For running the TAXONOMY "workflow"
+
+import os
+from tempfile import TemporaryDirectory
+
+configfile: "CONFIG.yaml"
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR=config["db_dir"]
+BARCODES=config["barcodes"]
+ASSEMBLERS=config["assemblers"]
+MAPPERS=["bwa", "mmi"]
+SAMPLES=config["samples"]
+BINNING_SAMPLES=config["binning_samples"]
+HYBRID_ASSEMBLER=config["hybrid_assembler"]
+
+
+# specify which rules to run
+include:
+    'TAXONOMY_RULES'
+
+# Rule all for TAXONOMY
+rule BINNING:    
+    input:
+        expand(os.path.join(RESULTS_DIR, "Binning/checkm_output/{sample}_output.txt"), sample=BINNING_SAMPLES),
+        expand(os.path.join(RESULTS_DIR, "Binning/gtdbtk_output/{sample}/gtdbtk.bac120.summary.tsv"), sample=BINNING_SAMPLES)
+    output:
+        touch('taxonomy_for_ont.done')
diff --git a/workflows/workflows_snakefile b/workflows/workflows_snakefile
new file mode 100644
index 0000000..3891453
--- /dev/null
+++ b/workflows/workflows_snakefile
@@ -0,0 +1,47 @@
+# include global functions
+include:
+    "workflow/rules/function.definitions.smk"
+
+# include configuration file
+include:
+    "workflow/rules/ini/config.smk"
+
+# set working directory and dump output
+workdir:
+    OUTPUTDIR
+
+
+# Single omics MG workflow
+elif MG:
+    if 'preprocessing' in IMP_STEPS:
+        if len(MG) == 2:
+            include:
+                "workflow/rules/modules/single_omics/mg/Preprocessing.smk"
+        if len(MG) == 1:
+            include:
+                "workflow/rules/modules/single_omics/mg/PreprocessingSE.smk"
+
+    if 'assembly' in IMP_STEPS:
+        include:
+            "workflow/rules/modules/single_omics/mg/Assembly.smk"
+
+    if 'analysis' in IMP_STEPS:
+        include:
+            "workflow/rules/modules/single_omics/mg/Analysis.smk"
+
+    if 'taxonomy' in IMP_STEPS:
+        include:
+            "workflow/rules/modules/single_omics/mg/Taxonomy.smk"
+
+    if 'binning' in IMP_STEPS:
+        include:
+            "workflow/rules/modules/single_omics/mg/Binning.smk"
+
+# master command
+rule ALL:
+    input:
+        inputs_all
+    output:
+        touch('status/all.done')
+
+
-- 
GitLab