Skip to content
Snippets Groups Projects
Commit 5a7350e8 authored by Valentina Galata's avatar Valentina Galata
Browse files

added rgi (issue #29)

parent 69406c10
No related branches found
No related tags found
2 merge requests!69Add PlasFlow and RGI,!67WIP: Checkpoint snakefile
# steps: "assembly_annotation mapping metaT mmseq binning taxonomy analysis"
# steps: "binning taxonomy analysis"
steps: "analysis"
# analysis_steps: ["cdhit", "mappability", "crispr", "plasflow"]
analysis_steps: ["plasflow"]
# analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
analysis_steps: ["plasmids", "amr"]
# working directory containing all relevant data,
# i.e. prefix for data, results, DBs etc.
work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
......@@ -171,4 +171,8 @@ minced:
plasflow:
threshold: 0.7 # class. prob. threshold
minlen: 1000 # rm contigs with length below this threshold
\ No newline at end of file
# AMR prediction
rgi:
db_url: "https://card.mcmaster.ca/latest/data"
alignment_tool: "DIAMOND" # DIAMOND or BLAST
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=1_llvm
- bamtools=2.5.1=he860b03_5
- bedtools=2.29.2=hc088bd4_0
- biopython=1.72=py36h14c3975_1000
- blast=2.9.0=pl526h3066fca_4
- bowtie2=2.3.5.1=py36he513fc3_0
- bwa=0.7.17=hed695b0_7
- bzip2=1.0.8=h516909a_2
- ca-certificates=2020.4.5.1=hecc5488_0
- certifi=2020.4.5.1=py36h9f0ad1d_0
- curl=7.69.1=h33f0ec9_0
- cycler=0.10.0=py_2
- diamond=0.8.36=h8b12597_4
- entrez-direct=13.3=pl526h375a9b1_0
- expat=2.2.9=he1b5a44_2
- filetype=1.0.7=pyh9f0ad1d_0
- freetype=2.10.2=he06d7ca_0
- htslib=1.9=h4da6232_3
- icu=64.2=he1b5a44_1
- kiwisolver=1.2.0=py36hdb11119_0
- krb5=1.17.1=h2fd8d38_0
- ld_impl_linux-64=2.34=h53a641e_4
- libblas=3.8.0=16_openblas
- libcblas=3.8.0=16_openblas
- libcurl=7.69.1=hf7181ac_0
- libdeflate=1.6=h516909a_0
- libedit=3.1.20191231=h46ee950_0
- libffi=3.2.1=he1b5a44_1007
- libgcc-ng=9.2.0=h24d8f2e_2
- libgfortran-ng=7.5.0=hdf63c60_6
- liblapack=3.8.0=16_openblas
- libopenblas=0.3.9=h5ec1e0e_0
- libpng=1.6.37=hed695b0_1
- libssh2=1.9.0=hab1572f_2
- libstdcxx-ng=9.2.0=hdf63c60_2
- llvm-openmp=10.0.0=hc9558a2_0
- matplotlib-base=3.2.1=py36hb8e4980_0
- ncurses=6.1=hf484d3e_1002
- numpy=1.18.4=py36h7314795_0
- oligoarrayaux=3.8=hc9558a2_0
- openssl=1.1.1g=h516909a_0
- pandas=1.0.4=py36h830a2c2_0
- patsy=0.5.1=py_0
- pcre=8.44=he1b5a44_0
- perl=5.26.2=h516909a_1006
- perl-app-cpanminus=1.7044=pl526_1
- perl-archive-tar=2.32=pl526_0
- perl-base=2.23=pl526_1
- perl-business-isbn=3.004=pl526_0
- perl-business-isbn-data=20140910.003=pl526_0
- perl-carp=1.38=pl526_3
- perl-common-sense=3.74=pl526_2
- perl-compress-raw-bzip2=2.087=pl526he1b5a44_0
- perl-compress-raw-zlib=2.087=pl526hc9558a2_0
- perl-constant=1.33=pl526_1
- perl-data-dumper=2.173=pl526_0
- perl-digest-hmac=1.03=pl526_3
- perl-digest-md5=2.55=pl526_0
- perl-encode=2.88=pl526_1
- perl-encode-locale=1.05=pl526_6
- perl-exporter=5.72=pl526_1
- perl-exporter-tiny=1.002001=pl526_0
- perl-extutils-makemaker=7.36=pl526_1
- perl-file-listing=6.04=pl526_1
- perl-file-path=2.16=pl526_0
- perl-file-temp=0.2304=pl526_2
- perl-html-parser=3.72=pl526h6bb024c_5
- perl-html-tagset=3.20=pl526_3
- perl-html-tree=5.07=pl526_1
- perl-http-cookies=6.04=pl526_0
- perl-http-daemon=6.01=pl526_1
- perl-http-date=6.02=pl526_3
- perl-http-message=6.18=pl526_0
- perl-http-negotiate=6.01=pl526_3
- perl-io-compress=2.087=pl526he1b5a44_0
- perl-io-html=1.001=pl526_2
- perl-io-socket-ssl=2.066=pl526_0
- perl-io-zlib=1.10=pl526_2
- perl-json=4.02=pl526_0
- perl-json-xs=2.34=pl526h6bb024c_3
- perl-libwww-perl=6.39=pl526_0
- perl-list-moreutils=0.428=pl526_1
- perl-list-moreutils-xs=0.428=pl526_0
- perl-lwp-mediatypes=6.04=pl526_0
- perl-lwp-protocol-https=6.07=pl526_4
- perl-mime-base64=3.15=pl526_1
- perl-mozilla-ca=20180117=pl526_1
- perl-net-http=6.19=pl526_0
- perl-net-ssleay=1.88=pl526h90d6eec_0
- perl-ntlm=1.09=pl526_4
- perl-parent=0.236=pl526_1
- perl-pathtools=3.75=pl526h14c3975_1
- perl-scalar-list-utils=1.52=pl526h516909a_0
- perl-socket=2.027=pl526_1
- perl-storable=3.15=pl526h14c3975_0
- perl-test-requiresinternet=0.05=pl526_0
- perl-time-local=1.28=pl526_1
- perl-try-tiny=0.30=pl526_1
- perl-types-serialiser=1.0=pl526_2
- perl-uri=1.76=pl526_0
- perl-www-robotrules=6.02=pl526_3
- perl-xml-namespacesupport=1.12=pl526_0
- perl-xml-parser=2.44_01=pl526ha1d75be_1002
- perl-xml-sax=1.02=pl526_0
- perl-xml-sax-base=1.09=pl526_0
- perl-xml-sax-expat=0.51=pl526_3
- perl-xml-simple=2.25=pl526_1
- perl-xsloader=0.24=pl526_0
- pip=20.1.1=py_1
- prodigal=2.6.3=h516909a_2
- pyahocorasick=1.4.0=py36h8c4c3a4_1
- pyfaidx=0.5.8=py_1
- pyparsing=2.4.7=pyh9f0ad1d_0
- python=3.6.10=h8356626_1011_cpython
- python-dateutil=2.8.1=py_0
- python_abi=3.6=1_cp36m
- pytz=2020.1=pyh9f0ad1d_0
- readline=8.0=hf8c457e_0
- rgi=5.1.0=py_1
- samtools=1.9=h10a08f8_12
- scipy=1.4.1=py36h2d22cac_3
- seaborn=0.10.1=py_0
- setuptools=47.1.1=py36h9f0ad1d_0
- six=1.15.0=pyh9f0ad1d_0
- sqlite=3.30.1=hcee41ef_0
- statsmodels=0.11.1=py36h8c4c3a4_1
- tbb=2020.1=hc9558a2_0
- tk=8.6.10=hed695b0_0
- tornado=6.0.4=py36h8c4c3a4_1
- wheel=0.34.2=py_1
- xz=5.2.5=h516909a_0
- zlib=1.2.11=h516909a_1006
......@@ -228,3 +228,82 @@ rule plasflow:
"PlasFlow.py --input {input} --output {output}.tmp --threshold {params.threshold} &> {log} && "
"cut -f3,4,6- {output}.tmp > {output} && "
"rm {output}.tmp*"
#######
# RGI #
#######
# RGI input: proteins
# NOTE: one rule per assembly to have a workaround for the issue with file paths
# should be resolved properly later
rule rgi_input_flye:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/flye/lr/merged/no_barcode/assembly.faa"))
output:
temp(os.path.join(RESULTS_DIR, "analysis/rgi/flye.faa"))
shell:
"sed 's/\*$//' {input} > {output}"
rule rgi_input_metaspades_hybrid:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/metaspades_hybrid/lr_no_barcode-sr_ONT3_MG_xx_Rashi_S11/contigs.faa"))
output:
os.path.join(RESULTS_DIR, "analysis/rgi/metaspades_hybrid.faa")
shell:
"sed 's/\*$//' {input} > {output}"
rule rgi_input_metaspades:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/metaspades/ONT3_MG_xx_Rashi_S11/final.contigs.faa"))
output:
os.path.join(RESULTS_DIR, "analysis/rgi/metaspades.faa")
shell:
"sed 's/\*$//' {input} > {output}"
rule rgi_input_megahit:
input:
os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/megahit/ONT3_MG_xx_Rashi_S11/final.contigs.faa"))
output:
os.path.join(RESULTS_DIR, "analysis/rgi/megahit.faa")
shell:
"sed 's/\*$//' {input} > {output}"
# RGI DBs
rule rgi_db:
output:
archive=temp(os.path.join(DB_DIR, "rgi/card-data.tar.bz2")),
json=os.path.join(DB_DIR, "rgi/card.json")
log:
os.path.join(DB_DIR, "rgi/rgi.log")
params:
db_url=config["rgi"]["db_url"]
message:
"Download RGI DB data"
shell:
"wget -O {output.archive} {params.db_url} --no-check-certificate &> {log} && "
"tar -C $(dirname {output.archive}) -xvf {output.archive} &>> {log}"
# Run RGI: proteins
rule rgi_prot:
input:
faa=os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.faa"),
db=os.path.join(DB_DIR, "rgi/card.json")
output:
os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.txt")
log:
os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.log")
threads: 10
params:
alignment_tool=config["rgi"]["alignment_tool"],
obname=lambda wildcards, output: os.path.splitext(output[0])[0]
conda:
os.path.join(ENV_DIR, "rgi.yaml")
message:
"AMR prediction w/ RGI: {input}"
shell:
"rgi clean --local &> {log} && "
"rgi load --card_json {input.db} --local &>> {log} && "
"rgi database --version --local &>> {log} && "
# NOTE: https://github.com/arpcard/rgi/issues/93
# need to run the CMD twice
"rgi main --input_sequence {input.faa} --output_file {params.obname} --input_type protein --local -a {params.alignment_tool} --clean -n {threads} &>> {log} || "
"rgi main --input_sequence {input.faa} --output_file {params.obname} --input_type protein --local -a {params.alignment_tool} --clean -n {threads} &>> {log}"
......@@ -83,8 +83,11 @@ if 'analysis' in STEPS:
if "crispr" in ANALYSIS_STEPS:
TARGETS.append("crispr_analysis.done")
# Plasmid prediction
if "plasflow" in ANALYSIS_STEPS:
TARGETS.append("plasflow_analysis.done")
if "plasmids" in ANALYSIS_STEPS:
TARGETS.append("plasmids_analysis.done")
# AMR prediction
if "amr" in ANALYSIS_STEPS:
TARGETS.append("amr_analysis.done")
#else:
# raise Exception('You are not serious. No input data')
......
......@@ -45,8 +45,14 @@ rule CRISPR:
output:
touch('crispr_analysis.done')
rule PLASFLOW:
rule PLASMIDS:
input:
expand(os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.tsv"), assembly=ASSEMBLERS)
output:
touch('plasflow_analysis.done')
\ No newline at end of file
touch('plasmids_analysis.done')
rule AMR:
input:
expand(os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.txt"), assembly=ASSEMBLERS)
output:
touch('amr_analysis.done')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment