diff --git a/2019_GDB/config/CONFIG.yaml b/2019_GDB/config/CONFIG.yaml index 0a99892ecf827268f70eebd4cf6635527eef922f..0499477e5502c1d1788f5c88267530f5c90403d5 100755 --- a/2019_GDB/config/CONFIG.yaml +++ b/2019_GDB/config/CONFIG.yaml @@ -1,8 +1,8 @@ # steps: "assembly_annotation mapping metaT mmseq binning taxonomy analysis" # steps: "binning taxonomy analysis" steps: "analysis" -# analysis_steps: ["cdhit", "mappability", "crispr", "plasflow"] -analysis_steps: ["plasflow"] +# analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"] +analysis_steps: ["plasmids", "amr"] # working directory containing all relevant data, # i.e. prefix for data, results, DBs etc. work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB" @@ -171,4 +171,8 @@ minced: plasflow: threshold: 0.7 # class. prob. threshold minlen: 1000 # rm contigs with length below this threshold - \ No newline at end of file + +# AMR prediction +rgi: + db_url: "https://card.mcmaster.ca/latest/data" + alignment_tool: "DIAMOND" # DIAMOND or BLAST diff --git a/2019_GDB/envs/rgi.yaml b/2019_GDB/envs/rgi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad8e7f7fe6c0d133aaf98add13916b798eb2787f --- /dev/null +++ b/2019_GDB/envs/rgi.yaml @@ -0,0 +1,139 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_llvm + - bamtools=2.5.1=he860b03_5 + - bedtools=2.29.2=hc088bd4_0 + - biopython=1.72=py36h14c3975_1000 + - blast=2.9.0=pl526h3066fca_4 + - bowtie2=2.3.5.1=py36he513fc3_0 + - bwa=0.7.17=hed695b0_7 + - bzip2=1.0.8=h516909a_2 + - ca-certificates=2020.4.5.1=hecc5488_0 + - certifi=2020.4.5.1=py36h9f0ad1d_0 + - curl=7.69.1=h33f0ec9_0 + - cycler=0.10.0=py_2 + - diamond=0.8.36=h8b12597_4 + - entrez-direct=13.3=pl526h375a9b1_0 + - expat=2.2.9=he1b5a44_2 + - filetype=1.0.7=pyh9f0ad1d_0 + - freetype=2.10.2=he06d7ca_0 + - htslib=1.9=h4da6232_3 + - icu=64.2=he1b5a44_1 + - kiwisolver=1.2.0=py36hdb11119_0 + - krb5=1.17.1=h2fd8d38_0 + - ld_impl_linux-64=2.34=h53a641e_4 + - libblas=3.8.0=16_openblas + - libcblas=3.8.0=16_openblas + - libcurl=7.69.1=hf7181ac_0 + - libdeflate=1.6=h516909a_0 + - libedit=3.1.20191231=h46ee950_0 + - libffi=3.2.1=he1b5a44_1007 + - libgcc-ng=9.2.0=h24d8f2e_2 + - libgfortran-ng=7.5.0=hdf63c60_6 + - liblapack=3.8.0=16_openblas + - libopenblas=0.3.9=h5ec1e0e_0 + - libpng=1.6.37=hed695b0_1 + - libssh2=1.9.0=hab1572f_2 + - libstdcxx-ng=9.2.0=hdf63c60_2 + - llvm-openmp=10.0.0=hc9558a2_0 + - matplotlib-base=3.2.1=py36hb8e4980_0 + - ncurses=6.1=hf484d3e_1002 + - numpy=1.18.4=py36h7314795_0 + - oligoarrayaux=3.8=hc9558a2_0 + - openssl=1.1.1g=h516909a_0 + - pandas=1.0.4=py36h830a2c2_0 + - patsy=0.5.1=py_0 + - pcre=8.44=he1b5a44_0 + - perl=5.26.2=h516909a_1006 + - perl-app-cpanminus=1.7044=pl526_1 + - perl-archive-tar=2.32=pl526_0 + - perl-base=2.23=pl526_1 + - perl-business-isbn=3.004=pl526_0 + - perl-business-isbn-data=20140910.003=pl526_0 + - perl-carp=1.38=pl526_3 + - perl-common-sense=3.74=pl526_2 + - perl-compress-raw-bzip2=2.087=pl526he1b5a44_0 + - perl-compress-raw-zlib=2.087=pl526hc9558a2_0 + - perl-constant=1.33=pl526_1 + - perl-data-dumper=2.173=pl526_0 + - perl-digest-hmac=1.03=pl526_3 + - perl-digest-md5=2.55=pl526_0 + - perl-encode=2.88=pl526_1 + - perl-encode-locale=1.05=pl526_6 + - perl-exporter=5.72=pl526_1 + - perl-exporter-tiny=1.002001=pl526_0 + - perl-extutils-makemaker=7.36=pl526_1 + - perl-file-listing=6.04=pl526_1 + - perl-file-path=2.16=pl526_0 + - perl-file-temp=0.2304=pl526_2 + - perl-html-parser=3.72=pl526h6bb024c_5 + - perl-html-tagset=3.20=pl526_3 + - perl-html-tree=5.07=pl526_1 + - perl-http-cookies=6.04=pl526_0 + - perl-http-daemon=6.01=pl526_1 + - perl-http-date=6.02=pl526_3 + - perl-http-message=6.18=pl526_0 + - perl-http-negotiate=6.01=pl526_3 + - perl-io-compress=2.087=pl526he1b5a44_0 + - perl-io-html=1.001=pl526_2 + - perl-io-socket-ssl=2.066=pl526_0 + - perl-io-zlib=1.10=pl526_2 + - perl-json=4.02=pl526_0 + - perl-json-xs=2.34=pl526h6bb024c_3 + - perl-libwww-perl=6.39=pl526_0 + - perl-list-moreutils=0.428=pl526_1 + - perl-list-moreutils-xs=0.428=pl526_0 + - perl-lwp-mediatypes=6.04=pl526_0 + - perl-lwp-protocol-https=6.07=pl526_4 + - perl-mime-base64=3.15=pl526_1 + - perl-mozilla-ca=20180117=pl526_1 + - perl-net-http=6.19=pl526_0 + - perl-net-ssleay=1.88=pl526h90d6eec_0 + - perl-ntlm=1.09=pl526_4 + - perl-parent=0.236=pl526_1 + - perl-pathtools=3.75=pl526h14c3975_1 + - perl-scalar-list-utils=1.52=pl526h516909a_0 + - perl-socket=2.027=pl526_1 + - perl-storable=3.15=pl526h14c3975_0 + - perl-test-requiresinternet=0.05=pl526_0 + - perl-time-local=1.28=pl526_1 + - perl-try-tiny=0.30=pl526_1 + - perl-types-serialiser=1.0=pl526_2 + - perl-uri=1.76=pl526_0 + - perl-www-robotrules=6.02=pl526_3 + - perl-xml-namespacesupport=1.12=pl526_0 + - perl-xml-parser=2.44_01=pl526ha1d75be_1002 + - perl-xml-sax=1.02=pl526_0 + - perl-xml-sax-base=1.09=pl526_0 + - perl-xml-sax-expat=0.51=pl526_3 + - perl-xml-simple=2.25=pl526_1 + - perl-xsloader=0.24=pl526_0 + - pip=20.1.1=py_1 + - prodigal=2.6.3=h516909a_2 + - pyahocorasick=1.4.0=py36h8c4c3a4_1 + - pyfaidx=0.5.8=py_1 + - pyparsing=2.4.7=pyh9f0ad1d_0 + - python=3.6.10=h8356626_1011_cpython + - python-dateutil=2.8.1=py_0 + - python_abi=3.6=1_cp36m + - pytz=2020.1=pyh9f0ad1d_0 + - readline=8.0=hf8c457e_0 + - rgi=5.1.0=py_1 + - samtools=1.9=h10a08f8_12 + - scipy=1.4.1=py36h2d22cac_3 + - seaborn=0.10.1=py_0 + - setuptools=47.1.1=py36h9f0ad1d_0 + - six=1.15.0=pyh9f0ad1d_0 + - sqlite=3.30.1=hcee41ef_0 + - statsmodels=0.11.1=py36h8c4c3a4_1 + - tbb=2020.1=hc9558a2_0 + - tk=8.6.10=hed695b0_0 + - tornado=6.0.4=py36h8c4c3a4_1 + - wheel=0.34.2=py_1 + - xz=5.2.5=h516909a_0 + - zlib=1.2.11=h516909a_1006 + diff --git a/2019_GDB/rules/ANALYSIS_RULES b/2019_GDB/rules/ANALYSIS_RULES index 52847d8b40ba13a731d1b23399d7af1f2d52940b..b1811a2e9b4c3ecec23375c6357666c2341900d3 100755 --- a/2019_GDB/rules/ANALYSIS_RULES +++ b/2019_GDB/rules/ANALYSIS_RULES @@ -228,3 +228,82 @@ rule plasflow: "PlasFlow.py --input {input} --output {output}.tmp --threshold {params.threshold} &> {log} && " "cut -f3,4,6- {output}.tmp > {output} && " "rm {output}.tmp*" + +####### +# RGI # +####### +# RGI input: proteins +# NOTE: one rule per assembly to have a workaround for the issue with file paths +# should be resolved properly later +rule rgi_input_flye: + input: + os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/flye/lr/merged/no_barcode/assembly.faa")) + output: + temp(os.path.join(RESULTS_DIR, "analysis/rgi/flye.faa")) + shell: + "sed 's/\*$//' {input} > {output}" + +rule rgi_input_metaspades_hybrid: + input: + os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/metaspades_hybrid/lr_no_barcode-sr_ONT3_MG_xx_Rashi_S11/contigs.faa")) + output: + os.path.join(RESULTS_DIR, "analysis/rgi/metaspades_hybrid.faa") + shell: + "sed 's/\*$//' {input} > {output}" + +rule rgi_input_metaspades: + input: + os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/metaspades/ONT3_MG_xx_Rashi_S11/final.contigs.faa")) + output: + os.path.join(RESULTS_DIR, "analysis/rgi/metaspades.faa") + shell: + "sed 's/\*$//' {input} > {output}" + +rule rgi_input_megahit: + input: + os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/megahit/ONT3_MG_xx_Rashi_S11/final.contigs.faa")) + output: + os.path.join(RESULTS_DIR, "analysis/rgi/megahit.faa") + shell: + "sed 's/\*$//' {input} > {output}" + +# RGI DBs +rule rgi_db: + output: + archive=temp(os.path.join(DB_DIR, "rgi/card-data.tar.bz2")), + json=os.path.join(DB_DIR, "rgi/card.json") + log: + os.path.join(DB_DIR, "rgi/rgi.log") + params: + db_url=config["rgi"]["db_url"] + message: + "Download RGI DB data" + shell: + "wget -O {output.archive} {params.db_url} --no-check-certificate &> {log} && " + "tar -C $(dirname {output.archive}) -xvf {output.archive} &>> {log}" + +# Run RGI: proteins +rule rgi_prot: + input: + faa=os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.faa"), + db=os.path.join(DB_DIR, "rgi/card.json") + output: + os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.txt") + log: + os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.log") + threads: 10 + params: + alignment_tool=config["rgi"]["alignment_tool"], + obname=lambda wildcards, output: os.path.splitext(output[0])[0] + conda: + os.path.join(ENV_DIR, "rgi.yaml") + message: + "AMR prediction w/ RGI: {input}" + shell: + "rgi clean --local &> {log} && " + "rgi load --card_json {input.db} --local &>> {log} && " + "rgi database --version --local &>> {log} && " + # NOTE: https://github.com/arpcard/rgi/issues/93 + # need to run the CMD twice + "rgi main --input_sequence {input.faa} --output_file {params.obname} --input_type protein --local -a {params.alignment_tool} --clean -n {threads} &>> {log} || " + "rgi main --input_sequence {input.faa} --output_file {params.obname} --input_type protein --local -a {params.alignment_tool} --clean -n {threads} &>> {log}" diff --git a/2019_GDB/updated_SNAKEFILE b/2019_GDB/updated_SNAKEFILE index cbf04a079c35bb88e4b26b4dfe3d43816deb90e6..c2d004e0cd876bb0243daa0574d14b30891a4aad 100755 --- a/2019_GDB/updated_SNAKEFILE +++ b/2019_GDB/updated_SNAKEFILE @@ -83,8 +83,11 @@ if 'analysis' in STEPS: if "crispr" in ANALYSIS_STEPS: TARGETS.append("crispr_analysis.done") # Plasmid prediction - if "plasflow" in ANALYSIS_STEPS: - TARGETS.append("plasflow_analysis.done") + if "plasmids" in ANALYSIS_STEPS: + TARGETS.append("plasmids_analysis.done") + # AMR prediction + if "amr" in ANALYSIS_STEPS: + TARGETS.append("amr_analysis.done") #else: # raise Exception('You are not serious. No input data') diff --git a/2019_GDB/workflows/analysis.smk b/2019_GDB/workflows/analysis.smk index a7eb0672f64a67df27f15d930b5c5c98db731332..30ff77419d591849ead36cb1c65b471cb7fb7615 100755 --- a/2019_GDB/workflows/analysis.smk +++ b/2019_GDB/workflows/analysis.smk @@ -45,8 +45,14 @@ rule CRISPR: output: touch('crispr_analysis.done') -rule PLASFLOW: +rule PLASMIDS: input: expand(os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.tsv"), assembly=ASSEMBLERS) output: - touch('plasflow_analysis.done') \ No newline at end of file + touch('plasmids_analysis.done') + +rule AMR: + input: + expand(os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.txt"), assembly=ASSEMBLERS) + output: + touch('amr_analysis.done')