Skip to content
Snippets Groups Projects
Commit 2173f20a authored by Valentina Galata's avatar Valentina Galata
Browse files

Merge branch 'checkpoint_snakefile' into 'master'

# Conflicts:
#   .gitignore
#   config/CONFIG.yaml
parents 2389917f 1a42ca25
No related branches found
No related tags found
2 merge requests!71Master,!67WIP: Checkpoint snakefile
Showing
with 1227 additions and 142 deletions
# File for running ONT analyses
# default configuration file
configfile:"config/CONFIG.yaml"
# default executable for snakmake
shell.executable("bash")
# input settings
RUNS=config['runs']['first']
STEPS=config['steps']
# include rules for the workflows based on "steps" in the CONFIG.yaml file
# ONT analyses workflow
TARGETS = []
if 'assembly_annotation' in STEPS:
include: "workflows/checkpoint_assembly_annotation.smk"
TARGETS += ["assemble_and_coverage.done",
"annotate.done",
"basecall_merge_qc.done",
"coverage_of_references.done",
"prodigal_gene_call.done",
"diamond_proteins.done"]
if 'mmseq' in STEPS:
include: "workflows/mmseq.smk"
TARGETS += ["mmseq_comparison_for_ont.done"]
if 'metaT' in STEPS:
include: "workflows/metat.smk"
TARGETS += ["metaT_mapping_for_ONT.done"]
if 'mapping' in STEPS:
include: "workflows/mapping.smk"
TARGETS += ["mapping_for_binning.done"]
if 'binning' in STEPS:
include: "workflows/binning.smk"
TARGETS += ["binning_for_ont.done"]
if 'taxonomy' in STEPS:
include: "workflows/taxonomy.smk"
TARGETS += ["taxonomy_for_ont.done"]
if 'analysis' in STEPS:
include: "workflows/analysis.smk"
TARGETS += ["analysis.done"]
#else:
# raise Exception('You are not serious. No input data')
# print("No input data provided")
rule all:
input:
TARGETS
......@@ -74,7 +74,7 @@
"mmseq2_compare":
{
"n" : 1,
"ncpus" : 8,
"ncpus" : 12,
"time": "00-04:00:00",
"partition" : "bigmem",
"qos": "qos-bigmem",
......@@ -157,10 +157,10 @@
},
"assemble_long_reads_w_flye":
{
"time" : "0-04:00:00",
"time" : "1-00:00:00",
"n" : 1,
"ncpus": 28,
"partition" : "batch",
"ncpus": 12,
"partition" : "bigmem",
"qos": "qos-batch",
"mail-type" : "ALL",
},
......@@ -214,6 +214,15 @@
"qos": "qos-batch",
"mail-type" : "ALL",
},
"assemble_short_reads_w_metaspades":
{
"time" : "01-00:00:00",
"n" : 1,
"ncpus": 12,
"partition" : "bigmem",
"qos": "qos-bigmem",
"mail-type" : "ALL",
},
"map_short_reads_to_short_read_contigs_w_bwa_mem":
{
"time" : "00-04:00:00",
......
steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"]
# analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
# working directory containing all relevant data,
# i.e. prefix for data, results, DBs etc.
work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
data_dir: "data"
results_dir: "results"
db_dir: "dbs"
runs:
first: "S1_SizeSelected"
second: "S3_Gtube"
# third: "20181108_0827_test"
# assemblers: ["flye"]
assemblers: ["flye", "megahit", "metaspades", "metaspades_hybrid"]
p7zip:
bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
threads: 4
ont_fast5_api:
single_to_multi_fast5:
bin: "single_to_multi_fast5"
batch: 8000
threads: 8
flowcell: "FLO-MIN106"
kit: "SQK-LSK109"
#barcodes: ["barcode06", "barcode07", "barcode08", "barcode09", "barcode10"]
barcodes: ["no_barcode"]
guppy_cpu:
path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
version: "cpu-3.1.5"
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
cpu_threads: 28
guppy_gpu:
path: "/home/users/sbusi/apps/ont-guppy/bin"
bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller"
version: "3.6.0+98ff765"
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
hac_config: "dna_r9.4.1_450bps_hac.cfg"
records_per_fastq: 8000
chunk_size: 1000
chunks_per_runner: 1000
num_callers: 4
runners_per_device: 2
gpu_device: "cuda:0"
cpu_threads: 28
guppy_barcoder:
path: "/home/users/sbusi/apps/ont-guppy/bin"
bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder"
version: "3.4.5+fb1fbfb"
records_per_fastq: 8000
threads: 8
nanostats:
short_reads_prefix: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/raw/short_reads"
#short_reads_prefix: "/mnt/isilon/projects/lcsb_sequencing/transfer/bioecosystem/Rashi/2019/Apr/fastq"
metaT_prefix: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/metaT"
fastp:
min_length: 40
minimap2:
threads: 16
igc:
uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
hg38:
uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
genomecov:
bin: "bedtools genomecov"
compute_avg_coverage:
bin: "scripts/coverage.awk"
bwa:
threads: 24
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
samtools:
sort:
threads: 4
chunk_size: "4G"
view:
threads: 4
flye:
bin: "flye"
threads: 27
genome_size: "1g"
operams:
bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
threads: 28
megahit:
threads: 28
nonpareil:
memory: 4096
threads: 14
medaka:
threads: 28
racon:
threads: 28
rebaler:
threads: 28
diamond:
threads: 28
#db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
metaspades:
threads: 28
mmseq2:
threads: 24
# Define sample names
samples: ["ONT3_MG_xx_Rashi_S11"]
binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
# Hybrid assembler
hybrid_assembler: "metaspades_hybrid"
# Directory where fastq files are
#data_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning"
# Directory to save the output to
#results_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning"
# Number of cpus or threads to use
threads: 28
# Path to the the 140GB Kraken2 database
kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# Path to DAS_Tool
DAS_Tool:
path: "/home/users/sbusi/apps/DAS_Tool-master"
bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
# Path to DAS_Tool database
# TODO: mv to DAS_Tool
dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# Mapping options
bwa:
threads: 24
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
samtools:
sort:
threads: 4
chunk_size: "4G"
view:
threads: 4
minimap2:
threads: 24
# Path to GTDBTK database
GTDBTK:
DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
# Rscript path
# TODO: mv to DAS_Tool
Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# XXX
mmseqs:
path: "/home/users/sbusi/apps/mmseqs/bin"
createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
# CRISPR
CASC:
PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
minced:
PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
# Plasmid prediction
plasflow:
threshold: 0.7 # class. prob. threshold
minlen: 1000 # rm contigs with length below this threshold
# AMR prediction
rgi:
db_url: "https://card.mcmaster.ca/latest/data"
alignment_tool: "DIAMOND" # DIAMOND or BLAST
steps: "assembly_annotation metaT"
# steps: "assembly_annotation mmseq metaT mapping binning taxonomy"
data_dir: "data"
results_dir: "results"
db_dir: "dbs"
runs:
first: "S1_SizeSelected"
second: "S3_Gtube"
# third: "20181108_0827_test"
barcodes: ["barcode06", "barcode07", "barcode08", "barcode09", "barcode10"]
assemblers: ["flye"]
p7zip:
bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
threads: 4
ont_fast5_api:
single_to_multi_fast5:
bin: "single_to_multi_fast5"
batch: 8000
threads: 8
flowcell: "FLO-MIN106"
kit: "SQK-LSK109"
#barcodes: ["barcode06", "barcode07", "barcode08", "barcode09", "barcode10"]
barcodes: ["barcode07"]
guppy_cpu:
path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
version: "cpu-3.1.5"
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
cpu_threads: 28
guppy_gpu:
path: "/home/users/sbusi/apps/ont-guppy/bin"
bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller"
version: "3.6.0+98ff765"
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
hac_config: "dna_r9.4.1_450bps_hac.cfg"
records_per_fastq: 8000
chunk_size: 1000
chunks_per_runner: 1000
num_callers: 4
runners_per_device: 2
gpu_device: "cuda:0"
cpu_threads: 28
guppy_barcoder:
path: "/home/users/sbusi/apps/ont-guppy/bin"
bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder"
version: "3.4.5+fb1fbfb"
records_per_fastq: 8000
threads: 8
nanostats:
short_reads_prefix: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/raw/short_reads"
#short_reads_prefix: "/mnt/isilon/projects/lcsb_sequencing/transfer/bioecosystem/Rashi/2019/Apr/fastq"
metaT_prefix: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/metaT"
fastp:
min_length: 40
minimap2:
threads: 16
igc:
uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
hg38:
uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
genomecov:
bin: "bedtools genomecov"
compute_avg_coverage:
bin: "scripts/coverage.awk"
bwa:
threads: 24
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
samtools:
sort:
threads: 4
chunk_size: "4G"
view:
threads: 4
flye:
bin: "flye"
threads: 27
genome_size: "1g"
operams:
bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
threads: 28
megahit:
threads: 28
nonpareil:
memory: 4096
threads: 14
medaka:
threads: 28
racon:
threads: 28
rebaler:
threads: 28
diamond:
threads: 28
db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
metaspades:
threads: 28
mmseq2:
threads: 24
# Define sample names
#samples: ["flye", "megahit", "metaspades_hybrid"]
# samples: ["flye", "megahit"]
# samples: ["metaspades_hybrid"]
binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
# Hybrid assembler
hybrid_assembler: "metaspades_hybrid"
# Directory where fastq files are
#data_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning"
# Directory to save the output to
#results_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning"
# Number of cpus or threads to use
threads: 28
# Path to the the 140GB Kraken2 database
kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# Path to DAS_Tool
DAS_Tool:
path: "/home/users/sbusi/apps/DAS_Tool-master"
bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
# Path to DAS_Tool database
dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# Mapping options
bwa:
threads: 24
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
samtools:
sort:
threads: 4
chunk_size: "4G"
view:
threads: 4
minimap2:
threads: 24
# Path to GTDBTK database
GTDBTK:
DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
# Rscript path
Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
name: analysis
channels:
- anaconda
- conda-forge
- bioconda
- defaults
- r
- etetoolkit
- au-eoed
dependencies:
- _libgcc_mutex=0.1
- _openmp_mutex=4.5
- bzip2=1.0.8
- ca-certificates=2020.4.5.1
- curl=7.69.1
- htslib=1.9
- java-jdk=7.0.91
- krb5=1.17.1
- libcurl=7.69.1
- libdeflate=1.2
- libedit=3.1.20170329
- libgcc-ng=9.2.0
- libgomp=9.2.0
- libssh2=1.9.0
- libstdcxx-ng=9.2.0
- ncurses=6.1
- openssl=1.1.1g
- perl=5.26.2
- samtools=1.9
- tk=8.6.10
- xz=5.2.5
- zlib=1.2.11
prefix: /home/users/sbusi/apps/miniconda3/envs/analysis
name: bbmap
channels:
- imp
- bioconda
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1
- _openmp_mutex=4.5
- alsa-lib=1.1.5
- bbmap=38.79
- fontconfig=2.13.1
- freetype=2.10.2
- giflib=5.2.1
- icu=67.1
- jpeg=9c
- lcms2=2.9
- libgcc-ng=9.2.0
- libgomp=9.2.0
- libiconv=1.15
- libpng=1.6.37
- libstdcxx-ng=9.2.0
- libtiff=4.1.0
- libuuid=2.32.1
- libwebp-base=1.1.0
- libxcb=1.13
- libxml2=2.9.10
- lz4-c=1.9.2
- openjdk=11.0.1
- pthread-stubs=0.4
- xorg-fixesproto=5.0
- xorg-inputproto=2.3.2
- xorg-kbproto=1.0.7
- xorg-libx11=1.6.9
- xorg-libxau=1.0.9
- xorg-libxdmcp=1.1.3
- xorg-libxext=1.3.4
- xorg-libxfixes=5.0.3
- xorg-libxi=1.7.10
- xorg-libxrender=0.9.10
- xorg-libxtst=1.2.3
- xorg-recordproto=1.14.2
- xorg-renderproto=0.11.1
- xorg-xextproto=7.3.0
- xorg-xproto=7.0.31
- xz=5.2.5
- zlib=1.2.11
- zstd=1.4.4
prefix: /home/users/sbusi/apps/miniconda3/envs/bbmap
......@@ -7,24 +7,24 @@ channels:
- etetoolkit
- au-eoed
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=0_gnu
- bzip2=1.0.8=h516909a_2
- ca-certificates=2020.4.5.1=hecc5488_0
- cd-hit=4.8.1=h8b12597_3
- gawk=5.1.0=h516909a_0
- gettext=0.19.8.1=hc5be6a0_1002
- libffi=3.2.1=he1b5a44_1007
- libgcc-ng=9.2.0=h24d8f2e_2
- libgomp=9.2.0=h24d8f2e_2
- libidn2=2.3.0=h516909a_0
- libstdcxx-ng=9.2.0=hdf63c60_2
- libunistring=0.9.10=h14c3975_0
- llvm-openmp=8.0.1=hc9558a2_0
- mmseqs2=11.e1a1c=h2d02072_0
- openmp=8.0.1=0
- openssl=1.1.1f=h516909a_0
- wget=1.20.1=h22169c7_0
- zlib=1.2.11=h516909a_1006
- _libgcc_mutex=0.1
- _openmp_mutex=4.5
- bzip2=1.0.8
- ca-certificates=2020.4.5.1
- cd-hit=4.8.1
- gawk=5.1.0
- gettext=0.19.8.1
- libffi=3.2.1
- libgcc-ng=9.2.0
- libgomp=9.2.0
- libidn2=2.3.0
- libstdcxx-ng=9.2.0
- libunistring=0.9.10
- llvm-openmp=8.0.1
- mmseqs2=11.e1a1c
- openmp=8.0.1
- openssl=1.1.1g
- wget=1.20.1
- zlib=1.2.11
prefix: /home/users/sbusi/apps/miniconda3/envs/cd-hit
......@@ -2,23 +2,3 @@ channels:
- bioconda
dependencies:
- diamond=0.9.25
# if running on esb-compute-01 use the below configuration
#name: diamond
#channels:
# - bioconda
# - conda-forge
# - defaults
#dependencies:
# - _libgcc_mutex=0.1
# - _openmp_mutex=4.5
# - boost-cpp=1.70.0
# - bzip2=1.0.8
# - diamond=0.9.32
# - icu=64.2
# - libgcc-ng=9.2.0
# - libgomp=9.2.0
# - libstdcxx-ng=9.2.0
# - xz=5.2.5
# - zlib=1.2.11
#prefix: /home/susheel.busi/miniconda3/envs/diamond
\ No newline at end of file
name: flye
channels:
- conda-forge
- bioconda
- defaults
- r
- etetoolkit
- au-eoed
dependencies:
- _libgcc_mutex=0.1
- _openmp_mutex=4.5
- ca-certificates=2020.4.5.1
- certifi=2020.4.5.1
- flye=2.7.1
- ld_impl_linux-64=2.34
- libffi=3.2.1
- libgcc-ng=9.2.0
- libgomp=9.2.0
- libstdcxx-ng=9.2.0
- ncurses=6.1
- openssl=1.1.1g
- pip=20.1
- python=3.7.6
- python_abi=3.7
- readline=8.0
- setuptools=46.1.3
- sqlite=3.30.1
- tk=8.6.10
- wheel=0.34.2
- xz=5.2.5
- zlib=1.2.11
prefix: /home/users/sbusi/apps/miniconda3/envs/flye
name: perl
channels:
- anaconda
- conda-forge
- bioconda
- defaults
- r
- etetoolkit
- au-eoed
dependencies:
- libgcc-ng=9.1.0=hdf63c60_0
- perl=5.26.2=h14c3975_0
prefix: /home/users/sbusi/apps/miniconda3/envs/perl
channels:
# for PlasFlow
- smaegol
# rest
- bioconda
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=1_llvm
- _r-mutex=1.0.1=anacondar_1
- backports=1.0=py_2
- backports.weakref=1.0rc1=py35_1
- bioconductor-biocgenerics=0.22.0=r3.3.2_0
- bioconductor-biostrings=2.42.1=r3.3.2_0
- bioconductor-iranges=2.8.2=r3.3.2_0
- bioconductor-s4vectors=0.12.2=r3.3.2_0
- bioconductor-xvector=0.14.1=r3.3.2_0
- bioconductor-zlibbioc=1.20.0=r3.3.2_1
- biopython=1.68=py35_0
- blas=2.16=openblas
- bleach=1.5.0=py35_0
- bzip2=1.0.8=h516909a_2
- ca-certificates=2020.4.5.1=hecc5488_0
- cairo=1.14.6=4
- certifi=2018.8.24=py35_1001
- curl=7.52.1=0
- fontconfig=2.12.1=6
- freetype=2.7=1
- gettext=0.19.8.1=hc5be6a0_1002
- glib=2.51.4=0
- graphite2=1.3.13=he1b5a44_1001
- gsl=2.6=h294904e_0
- harfbuzz=1.4.3=0
- html5lib=0.9999999=py35_0
- icu=58.2=hf484d3e_1000
- jpeg=9c=h14c3975_1001
- libblas=3.8.0=16_openblas
- libcblas=3.8.0=16_openblas
- libffi=3.2.1=he1b5a44_1007
- libgcc=7.2.0=h69d50b8_2
- libgcc-ng=9.2.0=h24d8f2e_2
- libgfortran-ng=7.3.0=hdf63c60_5
- libiconv=1.15=h516909a_1006
- liblapack=3.8.0=16_openblas
- liblapacke=3.8.0=16_openblas
- libopenblas=0.3.9=h5ec1e0e_0
- libpng=1.6.28=1
- libstdcxx-ng=9.2.0=hdf63c60_2
- libtiff=4.0.7=0
- libxml2=2.9.5=0
- llvm-openmp=10.0.0=hc9558a2_0
- markdown=3.2.1=py_0
- mmtf-python=1.0.2=py35_0
- mock=2.0.0=py35_0
- msgpack-python=0.5.6=py35h2d50403_3
- ncurses=5.9=10
- numpy=1.15.2=py35h99e49ec_0
- numpy-base=1.15.2=py35h2f8d375_0
- olefile=0.46=py_0
- openssl=1.0.2u=h516909a_0
- pandas=0.23.4=py35hf8a1672_0
- pango=1.40.4=0
- pbr=5.4.2=py_0
- pcre=8.44=he1b5a44_0
- pillow=4.3.0=py35_0
- pip=20.0.2=py_2
- pixman=0.34.0=h14c3975_1003
- protobuf=3.4.0=py35_0
- python=3.5.4=0
- python-dateutil=2.8.1=py_0
- pytz=2019.3=py_0
- r-base=3.3.2=5
- readline=6.2=0
- reportlab=3.4.0=py35_0
- rpy2=2.7.8=py35r3.3.2_1
- scikit-learn=0.20.0=py35h22eb022_1
- scipy=1.1.0=py35he2b7bc3_1
- setuptools=40.4.3=py35_0
- six=1.14.0=py_1
- sqlite=3.13.0=1
- tensorflow=1.3.0=py35_0
- tk=8.5.19=2
- webencodings=0.5.1=py_1
- werkzeug=1.0.1=pyh9f0ad1d_0
- wheel=0.34.2=py_1
- xz=5.2.5=h516909a_0
- zlib=1.2.8=3
# PlasFlow: should be installed from smaegol, NOT from pip
- plasflow=1.1.0
name: quast
channels:
- bioconda
- conda-forge
- defaults
dependencies:
- blast=2.6.0
- ca-certificates=2020.4.5.1
- certifi=2020.4.5.1
- circos=0.69.8
- cycler=0.10.0
- expat=2.2.9
- fontconfig=2.13.1
- freetype=2.10.2
- giflib=5.2.1
- glimmerhmm=3.0.4
- icu=64.2
- joblib=0.14.1
- jpeg=9c
- kiwisolver=1.2.0
- libblas=3.8.0
- libcblas=3.8.0
- libcxx=10.0.0
- libffi=3.2.1
- libgd=2.2.5
- libgfortran=4.0.0
- libiconv=1.15
- liblapack=3.8.0
- libopenblas=0.3.9
- libpng=1.6.37
- libtiff=4.1.0
- libwebp=1.0.2
- libxml2=2.9.10
- llvm-openmp=10.0.0
- lz4-c=1.9.2
- matplotlib=3.2.1
- matplotlib-base=3.2.1
- ncurses=6.1
- numpy=1.18.4
- openjdk=11.0.1
- openssl=1.1.1g
- perl=5.26.2
- perl-autoloader=5.74
- perl-carp=1.38
- perl-clone=0.42
- perl-config-general=2.63
- perl-digest-perl-md5=1.9
- perl-dynaloader=1.25
- perl-exporter=5.72
- perl-exporter-tiny=1.002001
- perl-extutils-makemaker=7.36
- perl-font-ttf=1.06
- perl-gd=2.71
- perl-io-string=1.08
- perl-list-moreutils=0.428
- perl-list-moreutils-xs=0.428
- perl-math-bezier=0.01
- perl-math-round=0.07
- perl-math-vecstat=0.08
- perl-module-implementation=0.09
- perl-module-runtime=0.016
- perl-number-format=1.75
- perl-params-validate=1.29
- perl-pathtools=3.75
- perl-readonly=2.05
- perl-regexp-common=2017060201
- perl-scalar-list-utils=1.52
- perl-set-intspan=1.19
- perl-statistics-basic=1.6611
- perl-svg=2.84
- perl-text-format=0.59
- perl-threaded=5.26.0
- perl-time-hires=1.9760
- perl-try-tiny=0.30
- perl-xml-parser=2.44
- perl-xsloader=0.24
- pip=20.1
- pyparsing=2.4.7
- python=3.6.10
- python-dateutil=2.8.1
- python_abi=3.6
- quast=5.0.2
- readline=8.0
- setuptools=46.3.0
- simplejson=3.8.1
- six=1.14.0
- sqlite=3.30.1
- tk=8.6.10
- tornado=6.0.4
- wheel=0.34.2
- xz=5.2.5
- zlib=1.2.11
- zstd=1.4.4
prefix: /home/users/sbusi/apps/miniconda3/envs/quast
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=1_llvm
- bamtools=2.5.1=he860b03_5
- bedtools=2.29.2=hc088bd4_0
- biopython=1.72=py36h14c3975_1000
- blast=2.9.0=pl526h3066fca_4
- bowtie2=2.3.5.1=py36he513fc3_0
- bwa=0.7.17=hed695b0_7
- bzip2=1.0.8=h516909a_2
- ca-certificates=2020.4.5.1=hecc5488_0
- certifi=2020.4.5.1=py36h9f0ad1d_0
- curl=7.69.1=h33f0ec9_0
- cycler=0.10.0=py_2
- diamond=0.8.36=h8b12597_4
- entrez-direct=13.3=pl526h375a9b1_0
- expat=2.2.9=he1b5a44_2
- filetype=1.0.7=pyh9f0ad1d_0
- freetype=2.10.2=he06d7ca_0
- htslib=1.9=h4da6232_3
- icu=64.2=he1b5a44_1
- kiwisolver=1.2.0=py36hdb11119_0
- krb5=1.17.1=h2fd8d38_0
- ld_impl_linux-64=2.34=h53a641e_4
- libblas=3.8.0=16_openblas
- libcblas=3.8.0=16_openblas
- libcurl=7.69.1=hf7181ac_0
- libdeflate=1.6=h516909a_0
- libedit=3.1.20191231=h46ee950_0
- libffi=3.2.1=he1b5a44_1007
- libgcc-ng=9.2.0=h24d8f2e_2
- libgfortran-ng=7.5.0=hdf63c60_6
- liblapack=3.8.0=16_openblas
- libopenblas=0.3.9=h5ec1e0e_0
- libpng=1.6.37=hed695b0_1
- libssh2=1.9.0=hab1572f_2
- libstdcxx-ng=9.2.0=hdf63c60_2
- llvm-openmp=10.0.0=hc9558a2_0
- matplotlib-base=3.2.1=py36hb8e4980_0
- ncurses=6.1=hf484d3e_1002
- numpy=1.18.4=py36h7314795_0
- oligoarrayaux=3.8=hc9558a2_0
- openssl=1.1.1g=h516909a_0
- pandas=1.0.4=py36h830a2c2_0
- patsy=0.5.1=py_0
- pcre=8.44=he1b5a44_0
- perl=5.26.2=h516909a_1006
- perl-app-cpanminus=1.7044=pl526_1
- perl-archive-tar=2.32=pl526_0
- perl-base=2.23=pl526_1
- perl-business-isbn=3.004=pl526_0
- perl-business-isbn-data=20140910.003=pl526_0
- perl-carp=1.38=pl526_3
- perl-common-sense=3.74=pl526_2
- perl-compress-raw-bzip2=2.087=pl526he1b5a44_0
- perl-compress-raw-zlib=2.087=pl526hc9558a2_0
- perl-constant=1.33=pl526_1
- perl-data-dumper=2.173=pl526_0
- perl-digest-hmac=1.03=pl526_3
- perl-digest-md5=2.55=pl526_0
- perl-encode=2.88=pl526_1
- perl-encode-locale=1.05=pl526_6
- perl-exporter=5.72=pl526_1
- perl-exporter-tiny=1.002001=pl526_0
- perl-extutils-makemaker=7.36=pl526_1
- perl-file-listing=6.04=pl526_1
- perl-file-path=2.16=pl526_0
- perl-file-temp=0.2304=pl526_2
- perl-html-parser=3.72=pl526h6bb024c_5
- perl-html-tagset=3.20=pl526_3
- perl-html-tree=5.07=pl526_1
- perl-http-cookies=6.04=pl526_0
- perl-http-daemon=6.01=pl526_1
- perl-http-date=6.02=pl526_3
- perl-http-message=6.18=pl526_0
- perl-http-negotiate=6.01=pl526_3
- perl-io-compress=2.087=pl526he1b5a44_0
- perl-io-html=1.001=pl526_2
- perl-io-socket-ssl=2.066=pl526_0
- perl-io-zlib=1.10=pl526_2
- perl-json=4.02=pl526_0
- perl-json-xs=2.34=pl526h6bb024c_3
- perl-libwww-perl=6.39=pl526_0
- perl-list-moreutils=0.428=pl526_1
- perl-list-moreutils-xs=0.428=pl526_0
- perl-lwp-mediatypes=6.04=pl526_0
- perl-lwp-protocol-https=6.07=pl526_4
- perl-mime-base64=3.15=pl526_1
- perl-mozilla-ca=20180117=pl526_1
- perl-net-http=6.19=pl526_0
- perl-net-ssleay=1.88=pl526h90d6eec_0
- perl-ntlm=1.09=pl526_4
- perl-parent=0.236=pl526_1
- perl-pathtools=3.75=pl526h14c3975_1
- perl-scalar-list-utils=1.52=pl526h516909a_0
- perl-socket=2.027=pl526_1
- perl-storable=3.15=pl526h14c3975_0
- perl-test-requiresinternet=0.05=pl526_0
- perl-time-local=1.28=pl526_1
- perl-try-tiny=0.30=pl526_1
- perl-types-serialiser=1.0=pl526_2
- perl-uri=1.76=pl526_0
- perl-www-robotrules=6.02=pl526_3
- perl-xml-namespacesupport=1.12=pl526_0
- perl-xml-parser=2.44_01=pl526ha1d75be_1002
- perl-xml-sax=1.02=pl526_0
- perl-xml-sax-base=1.09=pl526_0
- perl-xml-sax-expat=0.51=pl526_3
- perl-xml-simple=2.25=pl526_1
- perl-xsloader=0.24=pl526_0
- pip=20.1.1=py_1
- prodigal=2.6.3=h516909a_2
- pyahocorasick=1.4.0=py36h8c4c3a4_1
- pyfaidx=0.5.8=py_1
- pyparsing=2.4.7=pyh9f0ad1d_0
- python=3.6.10=h8356626_1011_cpython
- python-dateutil=2.8.1=py_0
- python_abi=3.6=1_cp36m
- pytz=2020.1=pyh9f0ad1d_0
- readline=8.0=hf8c457e_0
- rgi=5.1.0=py_1
- samtools=1.9=h10a08f8_12
- scipy=1.4.1=py36h2d22cac_3
- seaborn=0.10.1=py_0
- setuptools=47.1.1=py36h9f0ad1d_0
- six=1.15.0=pyh9f0ad1d_0
- sqlite=3.30.1=hcee41ef_0
- statsmodels=0.11.1=py36h8c4c3a4_1
- tbb=2020.1=hc9558a2_0
- tk=8.6.10=hed695b0_0
- tornado=6.0.4=py36h8c4c3a4_1
- wheel=0.34.2=py_1
- xz=5.2.5=h516909a_0
- zlib=1.2.11=h516909a_1006
name: snakemake
name: ONT_pilot
channels:
- conda-forge
- bioconda
......@@ -169,5 +169,3 @@ dependencies:
- zipp=2.1.0
- zlib=1.2.11
- zstd=1.4.4
prefix: /home/users/sbusi/apps/miniconda3/envs/snakemake
# For the following assessments
# 1. levels of partial and unique gene calls after prodigal analyses on assemblies
# 2. quast
# 3. mappability_index
# 4. uniquely mapped reads from BAM files
# 5. CRISPRS - via 'minced' and 'CASC'
# 6. Plasmids
############################################
######## Partial_vs_unique_ORFs ############
############################################
rule prodigal_partial_gene_comparison:
input:
int1=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{sample}/final.contigs.faa"), assembler="metaspades", sample=SAMPLES),
int2=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr_{barcode}-sr_{sample}/contigs.faa"), assembler="metaspades_hybrid", barcode=BARCODES, sample=SAMPLES)
output:
dat1=expand(os.path.join(RESULTS_DIR, "analysis/cdhit/{assembler}_partial_counts.txt"), assembler="metaspades"),
dat2=expand(os.path.join(RESULTS_DIR, "analysis/cdhit/{assembler}_partial_counts.txt"), assembler="metaspades_hybrid"),
dat3=os.path.join(RESULTS_DIR, "analysis/cdhit/partial_gene_counts.txt")
log: os.path.join(RESULTS_DIR, "analysis/cdhit/compare.log")
shell:
"""
(date &&\
grep -E -o "partial.{0,3}" {input.int1} | grep '10\|01\|11' | wc -l > {output.dat1} &&\
grep -E -o "partial.{0,3}" {input.int2} | grep '10\|01\|11' | wc -l > {output.dat2} &&\
cat {output.dat1} {output.dat2} | paste - - > {output.dat3} &&\
date) &> >(tee {log})
"""
rule bbmap_rename:
input:
mm1=rules.prodigal_partial_gene_comparison.input.int1,
mm2=rules.prodigal_partial_gene_comparison.input.int2
output:
mo1=os.path.join(RESULTS_DIR, "analysis/cdhit/spades.faa"),
mo2=os.path.join(RESULTS_DIR, "analysis/cdhit/hybrid.faa"),
log: os.path.join(RESULTS_DIR, "analysis/cdhit/bbmap.rename.log")
threads: config["mmseq2"]["threads"]
conda: "../envs/bbmap.yaml"
shell:
"""
(date &&\
rename.sh in={input.mm1} out={output.mo1} prefix=spades ignorejunk=t &&\
rename.sh in={input.mm2} out={output.mo2} prefix=hybrid ignorejunk=t &&\
date) &> >(tee {log})
"""
rule cdhit_compare:
input:
co1=rules.bbmap_rename.output.mo1,
co2=rules.bbmap_rename.output.mo2
output:
form1=os.path.join(RESULTS_DIR, "analysis/cdhit/spades_hybrid.fa"),
form2=os.path.join(RESULTS_DIR, "analysis/cdhit/hybrid_spades.fa"),
form3=os.path.join(RESULTS_DIR, "analysis/cdhit/cdhit_unique_counts.txt"),
form4=os.path.join(RESULTS_DIR, "analysis/cdhit/cdhit_compare.done")
log: os.path.join(RESULTS_DIR, "analysis/cdhit/cdhit.unique.log")
conda: "../envs/cd-hit.yaml"
shell:
"""
(date &&\
cd-hit-2d -i {input.co1} -i2 {input.co2} -o {output.form1} -c 0.9 -n 5 -d 0 -M 16000 -T 8 &&\
cd-hit-2d -i {input.co2} -i2 {input.co1} -o {output.form2} -c 0.9 -n 5 -d 0 -M 16000 -T 8 &&\
echo {output.form1} > tmp && grep -c '>' {output.form1} >> tmp &&\
echo {output.form2} >> tmp && grep -c '>' {output.form2} >> tmp &&\
cat tmp | paste - - > {output.form3} &&\
touch {output.form4}
date) &> >(tee {log})
"""
#########
# QUAST #
#########
rule quast:
input:
expand(os.path.join(RESULTS_DIR, "assembly/{assembler}.fa"), assembler=["flye", "megahit", "metaspades", "metaspades_hybrid"])
output:
os.path.join(RESULTS_DIR, "analysis/quast/report.txt")
log: os.path.join(RESULTS_DIR, "analysis/quast/quast.assemblies.log")
threads: config["mmseq2"]["threads"]
conda: "../envs/quast.yaml"
shell:
"""
(date &&\
metaquast.py --max-ref-number 0 --threads {threads} {input} -o $(dirname {output}) &&\
date) &> >(tee {log})
"""
#####################
# Mappability_index #
#####################
rule mappability_and_without_small_contigs_metaG:
input:
map1=os.path.join(RESULTS_DIR, "mapping/{mapping_sample}/{mapping_sample}.bam")
output:
gout1=os.path.join(RESULTS_DIR, "analysis/mappability_index/mi_{mapping_sample}.txt"),
gout2=os.path.join(RESULTS_DIR, "analysis/mappability_index/{mapping_sample}_counts.txt")
log: os.path.join(RESULTS_DIR, "analysis/mappability_index/mappability_index_{mapping_sample}.log")
conda: "../envs/analysis.yaml"
shell:
"""
(date &&\
samtools flagstat {input.map1} > {output.gout1} &&\
samtools idxstats {input.map1} > {output.gout2} &&\
date) &> >(tee {log})
"""
rule mappability_and_without_small_contigs_metaT:
input:
map2=os.path.join(RESULTS_DIR, "mapping/metaT/sr/{metaT_sample}_reads-x-{metaT_mapping_sample}_contigs.bam")
output:
tout1=os.path.join(RESULTS_DIR, "analysis/mappability_index/mi_{metaT_sample}_reads-x-{metaT_mapping_sample}.txt"),
tout2=os.path.join(RESULTS_DIR, "analysis/mappability_index/{metaT_sample}_reads-x-{metaT_mapping_sample}_counts.txt")
log: os.path.join(RESULTS_DIR, "analysis/mappability_index/mappability_index_metaT_{metaT_sample}_reads-x-{metaT_mapping_sample}.log")
conda: "../envs/analysis.yaml"
shell:
"""
(date &&\
samtools flagstat {input.map2} > {output.tout1} &&\
samtools idxstats {input.map2} > {output.tout2} &&\
date) &> >(tee {log})
"""
rule unique_mapping_reads:
input:
rules.mappability_and_without_small_contigs_metaG.input.map1
output:
os.path.join(RESULTS_DIR, "analysis/mappability_index/{mapping_sample}_unique_read_count.txt")
log: os.path.join(RESULTS_DIR, "analysis/mappability_index/{mapping_sample}_unique_read_count.log")
conda: "../envs/analysis.yaml"
shell:
"""
(date &&\
samtools view {input} | grep -v "XT:A:U" | uniq | wc -l > {output} &&\
date) &> >(tee {log})
"""
##########
# CRISPR #
##########
rule casc:
input:
os.path.join(RESULTS_DIR, "assembly/{assembly}.fa")
output:
cas1=os.path.join(RESULTS_DIR, "analysis/crispr/casc/{assembly}_casc_output/{assembly}.results.txt")
# cas2=os.path.join(RESULTS_DIR, "analysis/crispr/casc/{assembly}_casc_output/casc_CRISPR_output.txt")
# cas3=directory(os.path.join(RESULTS_DIR, "analysis/crispr/casc/{assembly}_casc_output"))
log: os.path.join(RESULTS_DIR, "analysis/crispr/casc/{assembly}_casc_output/casc.log")
threads: config["mmseq2"]["threads"]
conda: "../envs/analysis.yaml"
shell:
"""
export PATH={config[CASC][PATH]}
export PERL5LIB={config[CASC][PERL5LIB]}
casc -i {input} -o $(dirname {output.cas1}) -n 12 --conservative
echo $(basename -s ".results.txt" {output.cas1}) >> results/analysis/crispr/casc/casc_CRISPR_output.txt
cat {output.cas1} | awk '$8 == "true" {{sum += $2}} END {{print sum}}' >> results/analysis/crispr/casc/casc_CRISPR_output.txt
"""
rule minced:
input:
os.path.join(RESULTS_DIR, "assembly/{assembly}.fa")
output:
mn1=os.path.join(RESULTS_DIR, "analysis/crispr/minced/{assembly}.txt"),
mn2=os.path.join(RESULTS_DIR, "analysis/crispr/minced/{assembly}.gff")
log: os.path.join(RESULTS_DIR, "analysis/crispr/minced/{assembly}_minced.log")
conda: "../envs/analysis.yaml"
shell:
"""
(date &&\
export PATH={config[minced][PATH]} &&\
minced {input} {output.mn1} {output.mn2} &&\
echo $(basename -s ".txt" {output.mn1}) >> results/analysis/crispr/minced/minced_CRISPR_output.txt &&\
grep -c 'CRISPR' {output.mn1} >> results/analysis/crispr/minced/minced_CRISPR_output.txt &&\
date) &> >(tee {log})
"""
############
# PlasFlow #
############
# Filter input FASTA by seq. length
rule plasflow_input:
input:
os.path.join(RESULTS_DIR, "assembly/{assembly}.fa")
output:
temp(os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.fna"))
log:
os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.fna.log")
params:
script=os.path.join(SRC_DIR, "filter_fasta_by_length.pl"),
minlen=config["plasflow"]["minlen"]
message:
"Plasmid prediction w/ PlasFlow: {input}"
shell:
# script from PathoFact
"{params.script} {params.minlen} {input} > {output} 2> {log}"
# Run PlasFlow
rule plasflow:
input:
os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.fna")
output:
os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.tsv")
log:
os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.tsv.log")
params:
threshold=config["plasflow"]["threshold"]
conda:
os.path.join(ENV_DIR, "plasflow.yaml")
message:
"Plasmid prediction w/ PlasFlow: {input}"
shell:
"PlasFlow.py --input {input} --output {output}.tmp --threshold {params.threshold} &> {log} && "
"cut -f3,4,6- {output}.tmp > {output} && "
"rm {output}.tmp*"
#######
# RGI #
#######
# RGI input: proteins
# NOTE: remove stop codon symbol "*"
# NOTE: one rule per assembly to have a workaround for the issue with file paths
# should be resolved properly later
rule rgi_input_flye:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/flye/lr/merged/no_barcode/assembly.faa"))
output:
temp(os.path.join(RESULTS_DIR, "analysis/rgi/flye.faa"))
shell:
"sed 's/\*$//' {input} > {output}"
rule rgi_input_metaspades_hybrid:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/metaspades_hybrid/lr_no_barcode-sr_ONT3_MG_xx_Rashi_S11/contigs.faa"))
output:
temp(os.path.join(RESULTS_DIR, "analysis/rgi/metaspades_hybrid.faa"))
shell:
"sed 's/\*$//' {input} > {output}"
rule rgi_input_metaspades:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/metaspades/ONT3_MG_xx_Rashi_S11/final.contigs.faa"))
output:
temp(os.path.join(RESULTS_DIR, "analysis/rgi/metaspades.faa"))
shell:
"sed 's/\*$//' {input} > {output}"
rule rgi_input_megahit:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/megahit/ONT3_MG_xx_Rashi_S11/final.contigs.faa"))
output:
temp(os.path.join(RESULTS_DIR, "analysis/rgi/megahit.faa"))
shell:
"sed 's/\*$//' {input} > {output}"
# RGI DBs
rule rgi_db:
output:
archive=temp(os.path.join(DB_DIR, "rgi/card-data.tar.bz2")),
json=os.path.join(DB_DIR, "rgi/card.json")
log:
os.path.join(DB_DIR, "rgi/rgi.log")
params:
db_url=config["rgi"]["db_url"]
message:
"Download RGI DB data"
shell:
"wget -O {output.archive} {params.db_url} --no-check-certificate &> {log} && "
"tar -C $(dirname {output.archive}) -xvf {output.archive} &>> {log}"
# Run RGI: proteins
rule rgi_prot:
input:
faa=os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.faa"),
db=os.path.join(DB_DIR, "rgi/card.json")
output:
os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.txt")
log:
os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.log")
threads: 10
params:
alignment_tool=config["rgi"]["alignment_tool"],
obname=lambda wildcards, output: os.path.splitext(output[0])[0]
conda:
os.path.join(ENV_DIR, "rgi.yaml")
message:
"AMR prediction w/ RGI: {input}"
shell:
# NOTE: to make sure that the correct DB is used
"rgi clean --local &> {log} && "
"rgi load --card_json {input.db} --local &>> {log} && "
"rgi database --version --local &>> {log} && "
# NOTE: https://github.com/arpcard/rgi/issues/93: KeyError: 'snp'
# need to run the CMD twice
"rgi main --input_sequence {input.faa} --output_file {params.obname} --input_type protein --local -a {params.alignment_tool} --clean -n {threads} &>> {log} || "
"rgi main --input_sequence {input.faa} --output_file {params.obname} --input_type protein --local -a {params.alignment_tool} --clean -n {threads} &>> {log}"
# For running the ASSEMBLY and ANNOTATION workflow for ONT data
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
RUNS=[config["runs"]["first"],
config["runs"]["second"]]
# config["runs"]["third"]]
BARCODES=config["barcodes"]
SAMPLES=["ONT3_MG_xx_Rashi_S11"]
sr_sample=["ONT3_MG_xx_Rashi_S11"]
# SAMPLES_ALL=config["samples"]
REFERENCES=["igc", "hg38"]
IGC_URI=config["igc"]["uri"]
HG38_URI=config["hg38"]["uri"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa_mem", "minimap2"]
######
# RULES
######
......@@ -512,7 +491,8 @@ rule assemble_long_reads_w_flye:
output: os.path.join(RESULTS_DIR, "assembly/flye/lr/merged/{barcode}/assembly.fna")
threads: config["flye"]["threads"]
log: os.path.join(RESULTS_DIR, "assembly/flye/lr/merged/{barcode}/assembly.flye")
conda: "../envs/flye.yaml"
# conda: "../envs/flye.yaml"
conda: "../envs/flye_v2_7.yaml"
shell:
"""
(date &&\
......@@ -779,6 +759,26 @@ rule assemble_short_reads_w_megahit:
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
rule assemble_short_reads_w_metaspades:
input:
sr_r1=rules.run_fastp_on_short_reads.output.r1,
sr_r2=rules.run_fastp_on_short_reads.output.r2
output: os.path.join(RESULTS_DIR, "assembly/metaspades/{sr_sample}/final.contigs.fna")
threads: config["metaspades"]["threads"]
conda: "../envs/spades.yaml"
params:
tmp_dir=os.path.join(RESULTS_DIR, "assembly/metaspades/{sr_sample}_tmp")
log: os.path.join(RESULTS_DIR, "assembly/metaspades/{sr_sample}/metaspades.log")
shell:
"""
(date &&\
metaspades.py -k 21,33,55,77 -t {threads} -1 {input.sr_r1} -2 {input.sr_r2} -o {params.tmp_dir} &&\
mv {params.tmp_dir}/* $(dirname {output}) &&\
rmdir {params.tmp_dir} &&\
ln -s contigs.fasta {output} &&\
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
# N.B. A similar function exists above and code duplication is generally discouraged.
rule map_short_reads_to_short_read_contigs_w_bwa_mem:
input:
......
# Rules for BINNING workflow, i.e. generating MAGs from assemblies
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mm"]
#SAMPLES=config["samples"]
SAMPLES=["flye", "megathit", "metaspades_hybrid"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]
###################
# Preparing files #
###################
......@@ -55,7 +39,7 @@ rule prepare_assembly_files:
ln {input.bam2} {output.bout2}
date) 2> >(tee {log}.stderr) > >(tee {log}.stdout)
"""
#####################################
########### MAXBIN2 #################
......
# Rule for running the mapping "workflow" for ONT analsyes
#shell.executable("/bin/bash")
#shell.prefix("source ~/.bashrc; ")
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]
#SAMPLES=config["samples"]
SAMPLES=["flye", "megathit", "metaspades_hybrid"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]
#######################
## Mapping to hybrid ##
#######################
......
# Rules for mapping the metaT reads to the different assemblies
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]
# SAMPLES=config["samples"]
SAMPLES=["flye", "megathit", "metaspades_hybrid"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]
###########
## metaT ##
###########
......
# For running the MMSEQ2 comparison of proteins after assemblies are run through prokka/prodigal
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mm"]
# SAMPLES=config["samples"]
SAMPLES=["flye", "megahit", "metaspades_hybrid"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]
#############################
######## MMSEQ2 ############
#############################
......@@ -23,12 +7,14 @@ rule mmseq2_database:
input:
int1=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr/merged/{barcode}/assembly.faa"), assembler="flye", barcode=BARCODES),
int2=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{sr_sample}/final.contigs.faa"), assembler="megahit", sr_sample="ONT3_MG_xx_Rashi_S11"),
int3=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.faa"), assembler="metaspades_hybrid", barcode=BARCODES, sr_sample="ONT3_MG_xx_Rashi_S11")
int3=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/lr_{barcode}-sr_{sr_sample}/contigs.faa"), assembler="metaspades_hybrid", barcode=BARCODES, sr_sample="ONT3_MG_xx_Rashi_S11"),
int4=expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{sr_sample}/final.contigs.faa"), assembler="metaspades", sr_sample="ONT3_MG_xx_Rashi_S11"),
# expand(os.path.join(RESULTS_DIR, "annotation/proteins/{assembler}/{prefix}.faa"), assembler=["flye", "megahit", "metaspades_hybrid"])
output:
dat1=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="flye"),
dat2=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="megahit"),
dat3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid")
dat3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid"),
dat4=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades")
log: os.path.join(RESULTS_DIR, "annotation/mmseq2/database.mmseq2.log")
# conda: "../envs/cd-hit.yaml"
shell:
......@@ -37,6 +23,7 @@ rule mmseq2_database:
{config[mmseqs][createdb]} {input.int1} {output.dat1} &&\
{config[mmseqs][createdb]} {input.int2} {output.dat2} &&\
{config[mmseqs][createdb]} {input.int3} {output.dat3} &&\
{config[mmseqs][createdb]} {input.int4} {output.dat4} &&\
date) &> >(tee {log})
"""
......@@ -44,11 +31,15 @@ rule mmseq2_compare:
input:
mm1=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="flye"),
mm2=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="megahit"),
mm3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid")
mm3=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades_hybrid"),
mm4=expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler="metaspades")
output:
mo1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh"),
mo2=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh"),
mo3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")
mo3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh"),
mo4=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_rbh"),
mo5=os.path.join(RESULTS_DIR, "annotation/mmseq2/metaspades_metaspades_hybrid_rbh"),
mo6=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_rbh")
log: os.path.join(RESULTS_DIR, "annotation/mmseq2/compare.mmseq2.log")
threads: config["mmseq2"]["threads"]
# conda: "../envs/cd-hit.yaml"
......@@ -58,6 +49,9 @@ rule mmseq2_compare:
{config[mmseqs][rbh]} {input.mm1} {input.mm2} {output.mo1} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
{config[mmseqs][rbh]} {input.mm1} {input.mm3} {output.mo2} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
{config[mmseqs][rbh]} {input.mm2} {input.mm3} {output.mo3} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
{config[mmseqs][rbh]} {input.mm2} {input.mm4} {output.mo4} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
{config[mmseqs][rbh]} {input.mm3} {input.mm4} {output.mo5} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
{config[mmseqs][rbh]} {input.mm4} {input.mm1} {output.mo6} --min-seq-id 0.9 mmseq2_tmp --threads {threads} &&\
date) &> >(tee {log})
"""
......@@ -68,11 +62,18 @@ rule mmseq2_m8_format:
fo3=os.path.join(RESULTS_DIR, "annotation/mmseq2/metaspades_hybrid_db"),
fo4=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh"),
fo5=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh"),
fo6=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")
fo6=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh"),
fo7=os.path.join(RESULTS_DIR, "annotation/mmseq2/metaspades_db"),
fo8=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_rbh"),
fo9=os.path.join(RESULTS_DIR, "annotation/mmseq2/metaspades_metaspades_hybrid_rbh"),
fo10=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_rbh")
output:
form1=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit.m8"),
form2=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid.m8"),
form3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid.m8")
form3=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid.m8"),
form4=os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades.m8"),
form5=os.path.join(RESULTS_DIR, "annotation/mmseq2/metaspades_metaspades_hybrid.m8"),
form6=os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades.m8")
log: os.path.join(RESULTS_DIR, "annotation/mmseq2/convertalis.mmseq2.log")
# conda: "../envs/cd-hit.yaml"
shell:
......@@ -81,6 +82,9 @@ rule mmseq2_m8_format:
{config[mmseqs][convertalis]} {input.fo1} {input.fo2} {input.fo4} {output.form1} &&\
{config[mmseqs][convertalis]} {input.fo1} {input.fo3} {input.fo5} {output.form2} &&\
{config[mmseqs][convertalis]} {input.fo2} {input.fo3} {input.fo6} {output.form3} &&\
{config[mmseqs][convertalis]} {input.fo2} {input.fo7} {input.fo8} {output.form4} &&\
{config[mmseqs][convertalis]} {input.fo3} {input.fo7} {input.fo9} {output.form5} &&\
{config[mmseqs][convertalis]} {input.fo7} {input.fo1} {input.fo10} {output.form6} &&\
date) &> >(tee {log})
"""
......@@ -94,9 +98,10 @@ rule prepare_plot_files:
shell:
"""
(date &&\
scripts/prepare_plot_files.sh &&\
scripts/prepare_plot_files_w_metaspades.sh &&\
date) &> >(tee {log})
"""
# scripts/prepare_plot_files.sh &&\
rule mmseq2_plots:
input:
......@@ -109,6 +114,7 @@ rule mmseq2_plots:
shell:
"""
(date &&\
Rscript scripts/mmseq_plots.R {input.plot1} {input.plot2} &&\
Rscript scripts/mmseq_plots_w_metaspades.R {input.plot1} {input.plot2} &&\
date) &> >(tee {log})
"""
# Rscript scripts/mmseq_plots.R {input.plot1} {input.plot2} &&\
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment