Skip to content
Snippets Groups Projects

Add PlasFlow and RGI

Merged Valentina Galata requested to merge checkpoint_snakefile_VG into checkpoint_snakefile
1 file
+ 6
4
Compare changes
  • Side-by-side
  • Inline
+ 119
16
@@ -6,22 +6,6 @@
# 5. CRISPRS - via 'minced' and 'CASC'
# 6. Plasmids
import os
from tempfile import TemporaryDirectory
configfile: "config/CONFIG.yaml"
DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR=config["db_dir"]
BARCODES=config["barcodes"]
# ASSEMBLERS=config["assemblers"]
MAPPERS=["bwa", "mmi"]
# SAMPLES=config["samples"]
SAMPLES=["ONT3_MG_xx_Rashi_S11"]
BINNING_SAMPLES=config["binning_samples"]
HYBRID_ASSEMBLER=config["hybrid_assembler"]
############################################
######## Partial_vs_unique_ORFs ############
############################################
@@ -190,3 +174,122 @@ rule minced:
date) &> >(tee {log})
"""
############
# PlasFlow #
############
# Filter input FASTA by seq. length
rule plasflow_input:
input:
os.path.join(RESULTS_DIR, "assembly/{assembly}.fa")
output:
temp(os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.fna"))
log:
os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.fna.log")
params:
script=os.path.join(SRC_DIR, "filter_fasta_by_length.pl"),
minlen=config["plasflow"]["minlen"]
message:
"Plasmid prediction w/ PlasFlow: {input}"
shell:
# script from PathoFact
"{params.script} {params.minlen} {input} > {output} 2> {log}"
# Run PlasFlow
rule plasflow:
input:
os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.fna")
output:
os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.tsv")
log:
os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.tsv.log")
params:
threshold=config["plasflow"]["threshold"]
conda:
os.path.join(ENV_DIR, "plasflow.yaml")
message:
"Plasmid prediction w/ PlasFlow: {input}"
shell:
"PlasFlow.py --input {input} --output {output}.tmp --threshold {params.threshold} &> {log} && "
"cut -f3,4,6- {output}.tmp > {output} && "
"rm {output}.tmp*"
#######
# RGI #
#######
# RGI input: proteins
# NOTE: remove stop codon symbol "*"
# NOTE: one rule per assembly to have a workaround for the issue with file paths
# should be resolved properly later
rule rgi_input_flye:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/flye/lr/merged/no_barcode/assembly.faa"))
output:
temp(os.path.join(RESULTS_DIR, "analysis/rgi/flye.faa"))
shell:
"sed 's/\*$//' {input} > {output}"
rule rgi_input_metaspades_hybrid:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/metaspades_hybrid/lr_no_barcode-sr_ONT3_MG_xx_Rashi_S11/contigs.faa"))
output:
temp(os.path.join(RESULTS_DIR, "analysis/rgi/metaspades_hybrid.faa"))
shell:
"sed 's/\*$//' {input} > {output}"
rule rgi_input_metaspades:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/metaspades/ONT3_MG_xx_Rashi_S11/final.contigs.faa"))
output:
temp(os.path.join(RESULTS_DIR, "analysis/rgi/metaspades.faa"))
shell:
"sed 's/\*$//' {input} > {output}"
rule rgi_input_megahit:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/megahit/ONT3_MG_xx_Rashi_S11/final.contigs.faa"))
output:
temp(os.path.join(RESULTS_DIR, "analysis/rgi/megahit.faa"))
shell:
"sed 's/\*$//' {input} > {output}"
# RGI DBs
rule rgi_db:
output:
archive=temp(os.path.join(DB_DIR, "rgi/card-data.tar.bz2")),
json=os.path.join(DB_DIR, "rgi/card.json")
log:
os.path.join(DB_DIR, "rgi/rgi.log")
params:
db_url=config["rgi"]["db_url"]
message:
"Download RGI DB data"
shell:
"wget -O {output.archive} {params.db_url} --no-check-certificate &> {log} && "
"tar -C $(dirname {output.archive}) -xvf {output.archive} &>> {log}"
# Run RGI: proteins
rule rgi_prot:
input:
faa=os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.faa"),
db=os.path.join(DB_DIR, "rgi/card.json")
output:
os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.txt")
log:
os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.log")
threads: 10
params:
alignment_tool=config["rgi"]["alignment_tool"],
obname=lambda wildcards, output: os.path.splitext(output[0])[0]
conda:
os.path.join(ENV_DIR, "rgi.yaml")
message:
"AMR prediction w/ RGI: {input}"
shell:
# NOTE: to make sure that the correct DB is used
"rgi clean --local &> {log} && "
"rgi load --card_json {input.db} --local &>> {log} && "
"rgi database --version --local &>> {log} && "
# NOTE: https://github.com/arpcard/rgi/issues/93: KeyError: 'snp'
# need to run the CMD twice
"rgi main --input_sequence {input.faa} --output_file {params.obname} --input_type protein --local -a {params.alignment_tool} --clean -n {threads} &>> {log} || "
"rgi main --input_sequence {input.faa} --output_file {params.obname} --input_type protein --local -a {params.alignment_tool} --clean -n {threads} &>> {log}"
Loading