diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..112189a0a6a312add8be1f3a52ad5d3f87026b5f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+figures/output/**
+figures/data/nanostats_summary.tsv
+figures/data/crispr_summary.tsv
+**/.snakemake/
+slurm-*.out
+*.tmp
+*.log
\ No newline at end of file
diff --git a/2019_GDB/config/CONFIG.yaml b/2019_GDB/config/CONFIG.yaml
index aab8911bf4efe9daef1f357fbf111082335ffbd0..ee71f60ae41e9575d056363f158c8e7a1093e954 100755
--- a/2019_GDB/config/CONFIG.yaml
+++ b/2019_GDB/config/CONFIG.yaml
@@ -1,13 +1,23 @@
-# steps: "assembly_annotation mapping metaT mmseq binning taxonomy analysis"
-steps: "binning taxonomy analysis"
+steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"]
+
+# analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
+analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
+
+# working directory containing all relevant data,
+# i.e. prefix for data, results, DBs etc.
+work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
 data_dir: "data"
 results_dir: "results"
 db_dir: "dbs"
+
 runs:
   first: "S1_SizeSelected"
   second: "S3_Gtube"
 # third: "20181108_0827_test"
-assemblers: ["flye"]
+
+# assemblers: ["flye"]
+assemblers: ["flye", "megahit", "metaspades", "metaspades_hybrid"]
+
 p7zip:
   bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
   threads: 4
@@ -18,8 +28,10 @@ ont_fast5_api:
   threads: 8
 flowcell: "FLO-MIN106"
 kit: "SQK-LSK109"
+
 #barcodes: ["barcode06", "barcode07", "barcode08", "barcode09", "barcode10"]
 barcodes: ["no_barcode"]
+
 guppy_cpu:
   path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
   bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
@@ -45,10 +57,13 @@ guppy_barcoder:
   version: "3.4.5+fb1fbfb"
   records_per_fastq: 8000
   threads: 8
+
 nanostats:
+  short_reads_prefix: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/raw/short_reads"
   #short_reads_prefix: "/mnt/isilon/projects/lcsb_sequencing/transfer/bioecosystem/Rashi/2019/Apr/fastq"
   metaT_prefix: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/metaT"
+
 fastp:
   min_length: 40
 minimap2:
@@ -97,10 +112,9 @@ metaspades:
   threads: 28
 mmseq2:
   threads: 24
+
 # Define sample names
-#samples: ["flye", "megahit", "metaspades_hybrid"]
-# samples: ["flye", "megahit"]
-# samples: ["metaspades_hybrid"]
+samples: ["ONT3_MG_xx_Rashi_S11"]
 binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
 # Hybrid assembler
@@ -120,10 +134,11 @@ kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
 # Path to DAS_Tool
 DAS_Tool:
-    path: "/home/users/sbusi/apps/DAS_Tool-master"
-    bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
+    path: "/home/users/sbusi/apps/DAS_Tool-master"
+    bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
 # Path to DAS_Tool database
+# TODO: mv to DAS_Tool
 dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
 
 # Mapping options
@@ -145,18 +160,29 @@ GTDBTK:
   DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
 
 # Rscript path
+# TODO: mv to DAS_Tool
 Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
+# XXX
 mmseqs:
-    path: "/home/users/sbusi/apps/mmseqs/bin"
-    createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
-    rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
-    convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
+    path: "/home/users/sbusi/apps/mmseqs/bin"
+    createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
+    rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
+    convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
 
-# Paths to CASC and minced for CRISPR
+# CRISPR
 CASC:
   PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
   PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
-
 minced:
   PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
+
+# Plasmid prediction
+plasflow:
+  threshold: 0.7 # classification probability threshold
+  minlen: 1000 # remove contigs shorter than this threshold
+
+# AMR prediction
+rgi:
+  db_url: "https://card.mcmaster.ca/latest/data"
+  alignment_tool: "DIAMOND" # DIAMOND or BLAST
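The list-valued `steps`/`analysis_steps` and the new `plasflow`/`rgi` blocks are consumed by updated_SNAKEFILE further down. As a pre-flight check outside of Snakemake, the config could be validated along these lines (a sketch only, assuming PyYAML is installed and the command is run from the 2019_GDB directory; the KNOWN_* sets simply restate the values used in this repository):

import yaml

KNOWN_STEPS = {"assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"}
KNOWN_ANALYSIS_STEPS = {"cdhit", "mappability", "crispr", "plasmids", "amr"}

with open("config/CONFIG.yaml") as fh:
    cfg = yaml.safe_load(fh)

# every requested step must be one the Snakefile knows how to include
assert set(cfg["steps"]) <= KNOWN_STEPS, "unknown entry in 'steps'"
assert set(cfg["analysis_steps"]) <= KNOWN_ANALYSIS_STEPS, "unknown entry in 'analysis_steps'"
# sanity-check the new tool-specific settings
assert 0.0 <= cfg["plasflow"]["threshold"] <= 1.0, "plasflow.threshold must be a probability"
assert cfg["plasflow"]["minlen"] > 0, "plasflow.minlen must be positive"
assert cfg["rgi"]["alignment_tool"] in {"DIAMOND", "BLAST"}, "rgi.alignment_tool must be DIAMOND or BLAST"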
diff --git a/2019_GDB/envs/plasflow.yaml b/2019_GDB/envs/plasflow.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ecce806232735d371ddb0e63b73190603af06c87
--- /dev/null
+++ b/2019_GDB/envs/plasflow.yaml
@@ -0,0 +1,90 @@
+channels:
+  # for PlasFlow
+  - smaegol
+  # rest
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=conda_forge
+  - _openmp_mutex=4.5=1_llvm
+  - _r-mutex=1.0.1=anacondar_1
+  - backports=1.0=py_2
+  - backports.weakref=1.0rc1=py35_1
+  - bioconductor-biocgenerics=0.22.0=r3.3.2_0
+  - bioconductor-biostrings=2.42.1=r3.3.2_0
+  - bioconductor-iranges=2.8.2=r3.3.2_0
+  - bioconductor-s4vectors=0.12.2=r3.3.2_0
+  - bioconductor-xvector=0.14.1=r3.3.2_0
+  - bioconductor-zlibbioc=1.20.0=r3.3.2_1
+  - biopython=1.68=py35_0
+  - blas=2.16=openblas
+  - bleach=1.5.0=py35_0
+  - bzip2=1.0.8=h516909a_2
+  - ca-certificates=2020.4.5.1=hecc5488_0
+  - cairo=1.14.6=4
+  - certifi=2018.8.24=py35_1001
+  - curl=7.52.1=0
+  - fontconfig=2.12.1=6
+  - freetype=2.7=1
+  - gettext=0.19.8.1=hc5be6a0_1002
+  - glib=2.51.4=0
+  - graphite2=1.3.13=he1b5a44_1001
+  - gsl=2.6=h294904e_0
+  - harfbuzz=1.4.3=0
+  - html5lib=0.9999999=py35_0
+  - icu=58.2=hf484d3e_1000
+  - jpeg=9c=h14c3975_1001
+  - libblas=3.8.0=16_openblas
+  - libcblas=3.8.0=16_openblas
+  - libffi=3.2.1=he1b5a44_1007
+  - libgcc=7.2.0=h69d50b8_2
+  - libgcc-ng=9.2.0=h24d8f2e_2
+  - libgfortran-ng=7.3.0=hdf63c60_5
+  - libiconv=1.15=h516909a_1006
+  - liblapack=3.8.0=16_openblas
+  - liblapacke=3.8.0=16_openblas
+  - libopenblas=0.3.9=h5ec1e0e_0
+  - libpng=1.6.28=1
+  - libstdcxx-ng=9.2.0=hdf63c60_2
+  - libtiff=4.0.7=0
+  - libxml2=2.9.5=0
+  - llvm-openmp=10.0.0=hc9558a2_0
+  - markdown=3.2.1=py_0
+  - mmtf-python=1.0.2=py35_0
+  - mock=2.0.0=py35_0
+  - msgpack-python=0.5.6=py35h2d50403_3
+  - ncurses=5.9=10
+  - numpy=1.15.2=py35h99e49ec_0
+  - numpy-base=1.15.2=py35h2f8d375_0
+  - olefile=0.46=py_0
+  - openssl=1.0.2u=h516909a_0
+  - pandas=0.23.4=py35hf8a1672_0
+  - pango=1.40.4=0
+  - pbr=5.4.2=py_0
+  - pcre=8.44=he1b5a44_0
+  - pillow=4.3.0=py35_0
+  - pip=20.0.2=py_2
+  - pixman=0.34.0=h14c3975_1003
+  - protobuf=3.4.0=py35_0
+  - python=3.5.4=0
+  - python-dateutil=2.8.1=py_0
+  - pytz=2019.3=py_0
+  - r-base=3.3.2=5
+  - readline=6.2=0
+  - reportlab=3.4.0=py35_0
+  - rpy2=2.7.8=py35r3.3.2_1
+  - scikit-learn=0.20.0=py35h22eb022_1
+  - scipy=1.1.0=py35he2b7bc3_1
+  - setuptools=40.4.3=py35_0
+  - six=1.14.0=py_1
+  - sqlite=3.13.0=1
+  - tensorflow=1.3.0=py35_0
+  - tk=8.5.19=2
+  - webencodings=0.5.1=py_1
+  - werkzeug=1.0.1=pyh9f0ad1d_0
+  - wheel=0.34.2=py_1
+  - xz=5.2.5=h516909a_0
+  - zlib=1.2.8=3
+  # PlasFlow: should be installed from smaegol, NOT from pip
+  - plasflow=1.1.0
diff --git a/2019_GDB/envs/rgi.yaml b/2019_GDB/envs/rgi.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ad8e7f7fe6c0d133aaf98add13916b798eb2787f
--- /dev/null
+++ b/2019_GDB/envs/rgi.yaml
@@ -0,0 +1,139 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=conda_forge
+  - _openmp_mutex=4.5=1_llvm
+  - bamtools=2.5.1=he860b03_5
+  - bedtools=2.29.2=hc088bd4_0
+  - biopython=1.72=py36h14c3975_1000
+  - blast=2.9.0=pl526h3066fca_4
+  - bowtie2=2.3.5.1=py36he513fc3_0
+  - bwa=0.7.17=hed695b0_7
+  - bzip2=1.0.8=h516909a_2
+  - ca-certificates=2020.4.5.1=hecc5488_0
+  - certifi=2020.4.5.1=py36h9f0ad1d_0
+  - curl=7.69.1=h33f0ec9_0
+  - cycler=0.10.0=py_2
+  - diamond=0.8.36=h8b12597_4
+  - entrez-direct=13.3=pl526h375a9b1_0
+  - expat=2.2.9=he1b5a44_2
+  - filetype=1.0.7=pyh9f0ad1d_0
+  - freetype=2.10.2=he06d7ca_0
+  - htslib=1.9=h4da6232_3
+  - icu=64.2=he1b5a44_1
+  - kiwisolver=1.2.0=py36hdb11119_0
+  - krb5=1.17.1=h2fd8d38_0
+  - ld_impl_linux-64=2.34=h53a641e_4
+  - libblas=3.8.0=16_openblas
+  - libcblas=3.8.0=16_openblas
+  - libcurl=7.69.1=hf7181ac_0
+  - libdeflate=1.6=h516909a_0
+  - libedit=3.1.20191231=h46ee950_0
+  - libffi=3.2.1=he1b5a44_1007
+  - libgcc-ng=9.2.0=h24d8f2e_2
+  - libgfortran-ng=7.5.0=hdf63c60_6
+  - liblapack=3.8.0=16_openblas
+  - libopenblas=0.3.9=h5ec1e0e_0
+  - libpng=1.6.37=hed695b0_1
+  - libssh2=1.9.0=hab1572f_2
+  - libstdcxx-ng=9.2.0=hdf63c60_2
+  - llvm-openmp=10.0.0=hc9558a2_0
+  - matplotlib-base=3.2.1=py36hb8e4980_0
+  - ncurses=6.1=hf484d3e_1002
+  - numpy=1.18.4=py36h7314795_0
+  - oligoarrayaux=3.8=hc9558a2_0
+  - openssl=1.1.1g=h516909a_0
+  - pandas=1.0.4=py36h830a2c2_0
+  - patsy=0.5.1=py_0
+  - pcre=8.44=he1b5a44_0
+  - perl=5.26.2=h516909a_1006
+  - perl-app-cpanminus=1.7044=pl526_1
+  - perl-archive-tar=2.32=pl526_0
+  - perl-base=2.23=pl526_1
+  - perl-business-isbn=3.004=pl526_0
+  - perl-business-isbn-data=20140910.003=pl526_0
+  - perl-carp=1.38=pl526_3
+  - perl-common-sense=3.74=pl526_2
+  - perl-compress-raw-bzip2=2.087=pl526he1b5a44_0
+  - perl-compress-raw-zlib=2.087=pl526hc9558a2_0
+  - perl-constant=1.33=pl526_1
+  - perl-data-dumper=2.173=pl526_0
+  - perl-digest-hmac=1.03=pl526_3
+  - perl-digest-md5=2.55=pl526_0
+  - perl-encode=2.88=pl526_1
+  - perl-encode-locale=1.05=pl526_6
+  - perl-exporter=5.72=pl526_1
+  - perl-exporter-tiny=1.002001=pl526_0
+  - perl-extutils-makemaker=7.36=pl526_1
+  - perl-file-listing=6.04=pl526_1
+  - perl-file-path=2.16=pl526_0
+  - perl-file-temp=0.2304=pl526_2
+  - perl-html-parser=3.72=pl526h6bb024c_5
+  - perl-html-tagset=3.20=pl526_3
+  - perl-html-tree=5.07=pl526_1
+  - perl-http-cookies=6.04=pl526_0
+  - perl-http-daemon=6.01=pl526_1
+  - perl-http-date=6.02=pl526_3
+  - perl-http-message=6.18=pl526_0
+  - perl-http-negotiate=6.01=pl526_3
+  - perl-io-compress=2.087=pl526he1b5a44_0
+  - perl-io-html=1.001=pl526_2
+  - perl-io-socket-ssl=2.066=pl526_0
+  - perl-io-zlib=1.10=pl526_2
+  - perl-json=4.02=pl526_0
+  - perl-json-xs=2.34=pl526h6bb024c_3
+  - perl-libwww-perl=6.39=pl526_0
+  - perl-list-moreutils=0.428=pl526_1
+  - perl-list-moreutils-xs=0.428=pl526_0
+  - perl-lwp-mediatypes=6.04=pl526_0
+  - perl-lwp-protocol-https=6.07=pl526_4
+  - perl-mime-base64=3.15=pl526_1
+  - perl-mozilla-ca=20180117=pl526_1
+  - perl-net-http=6.19=pl526_0
+  - perl-net-ssleay=1.88=pl526h90d6eec_0
+  - perl-ntlm=1.09=pl526_4
+  - perl-parent=0.236=pl526_1
+  - perl-pathtools=3.75=pl526h14c3975_1
+  - perl-scalar-list-utils=1.52=pl526h516909a_0
+  - perl-socket=2.027=pl526_1
+  - perl-storable=3.15=pl526h14c3975_0
+  - perl-test-requiresinternet=0.05=pl526_0
+  - perl-time-local=1.28=pl526_1
+  - perl-try-tiny=0.30=pl526_1
+  - perl-types-serialiser=1.0=pl526_2
+  - perl-uri=1.76=pl526_0
+  - perl-www-robotrules=6.02=pl526_3
+  - perl-xml-namespacesupport=1.12=pl526_0
+  - perl-xml-parser=2.44_01=pl526ha1d75be_1002
+  - perl-xml-sax=1.02=pl526_0
+  - perl-xml-sax-base=1.09=pl526_0
+  - perl-xml-sax-expat=0.51=pl526_3
+  - perl-xml-simple=2.25=pl526_1
+  - perl-xsloader=0.24=pl526_0
+  - pip=20.1.1=py_1
+  - prodigal=2.6.3=h516909a_2
+  - pyahocorasick=1.4.0=py36h8c4c3a4_1
+  - pyfaidx=0.5.8=py_1
+  - pyparsing=2.4.7=pyh9f0ad1d_0
+  - python=3.6.10=h8356626_1011_cpython
+  - python-dateutil=2.8.1=py_0
+  - python_abi=3.6=1_cp36m
+  - pytz=2020.1=pyh9f0ad1d_0
+  - readline=8.0=hf8c457e_0
+  - rgi=5.1.0=py_1
+  - samtools=1.9=h10a08f8_12
+  - scipy=1.4.1=py36h2d22cac_3
+  - seaborn=0.10.1=py_0
+  - setuptools=47.1.1=py36h9f0ad1d_0
+  - six=1.15.0=pyh9f0ad1d_0
+  - sqlite=3.30.1=hcee41ef_0
+  - statsmodels=0.11.1=py36h8c4c3a4_1
+  - tbb=2020.1=hc9558a2_0
+  - tk=8.6.10=hed695b0_0
+  - tornado=6.0.4=py36h8c4c3a4_1
+  - wheel=0.34.2=py_1
+  - xz=5.2.5=h516909a_0
+  - zlib=1.2.11=h516909a_1006
+
diff --git a/2019_GDB/envs/snakemake.yaml b/2019_GDB/envs/snakemake.yaml
index f6777ba72734cfc1b550a27bf4a7066179256100..b728fca61b3108cda90d81c0fcda0358a297957c 100755
--- a/2019_GDB/envs/snakemake.yaml
+++ b/2019_GDB/envs/snakemake.yaml
@@ -1,4 +1,4 @@
-name: snakemake
+name: ONT_pilot
 channels:
   - conda-forge
   - bioconda
@@ -169,5 +169,3 @@ dependencies:
   - zipp=2.1.0
   - zlib=1.2.11
   - zstd=1.4.4
-prefix: /home/users/sbusi/apps/miniconda3/envs/snakemake
-
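The two new environment files pin PlasFlow 1.1.0 (from the smaegol channel, per the inline comment) and RGI 5.1.0; they are only built when Snakemake is run with conda integration enabled. To see which tool versions are pinned without solving the environments, the YAML can be parsed directly; a small sketch, assuming PyYAML is available and the files live under 2019_GDB/envs/:

import yaml

for env in ("2019_GDB/envs/plasflow.yaml", "2019_GDB/envs/rgi.yaml"):
    with open(env) as fh:
        spec = yaml.safe_load(fh)
    # dependencies are "name=version=build" strings; keep name and version only
    pins = dict(dep.split("=")[:2] for dep in spec["dependencies"] if isinstance(dep, str))
    print(env, {k: v for k, v in pins.items() if k in {"plasflow", "rgi", "python"}})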
diff --git a/2019_GDB/rules/ANALYSIS_RULES b/2019_GDB/rules/ANALYSIS_RULES
index 9e8b8cfc2baab4c8388e8c60cd6f16965d3fd4f4..67402f7bce2ab2bd4095fd66ff0fa0c98459a10d 100755
--- a/2019_GDB/rules/ANALYSIS_RULES
+++ b/2019_GDB/rules/ANALYSIS_RULES
@@ -6,22 +6,6 @@
 # 5. CRISPRS - via 'minced' and 'CASC'
 # 6. Plasmids
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-# ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mmi"]
-# SAMPLES=config["samples"]
-SAMPLES=["ONT3_MG_xx_Rashi_S11"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
-
 ############################################
 ######## Partial_vs_unique_ORFs ############
 ############################################
@@ -190,3 +174,122 @@ rule minced:
         date) &> >(tee {log})
         """
 
+############
+# PlasFlow #
+############
+# Filter input FASTA by seq. length
+rule plasflow_input:
+    input:
+        os.path.join(RESULTS_DIR, "assembly/{assembly}.fa")
+    output:
+        temp(os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.fna"))
+    log:
+        os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.fna.log")
+    params:
+        script=os.path.join(SRC_DIR, "filter_fasta_by_length.pl"),
+        minlen=config["plasflow"]["minlen"]
+    message:
+        "Plasmid prediction w/ PlasFlow: {input}"
+    shell:
+        # script from PathoFact
+        "{params.script} {params.minlen} {input} > {output} 2> {log}"
+
+# Run PlasFlow
+rule plasflow:
+    input:
+        os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.fna")
+    output:
+        os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.tsv")
+    log:
+        os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.tsv.log")
+    params:
+        threshold=config["plasflow"]["threshold"]
+    conda:
+        os.path.join(ENV_DIR, "plasflow.yaml")
+    message:
+        "Plasmid prediction w/ PlasFlow: {input}"
+    shell:
+        "PlasFlow.py --input {input} --output {output}.tmp --threshold {params.threshold} &> {log} && "
+        "cut -f3,4,6- {output}.tmp > {output} && "
+        "rm {output}.tmp*"
+
+#######
+# RGI #
+#######
+# RGI input: proteins
+# NOTE: remove stop codon symbol "*"
+# NOTE: one rule per assembly as a workaround for the file path issue;
+# should be resolved properly later
+rule rgi_input_flye:
+    input:
+        os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/flye/lr/merged/no_barcode/assembly.faa"))
+    output:
+        temp(os.path.join(RESULTS_DIR, "analysis/rgi/flye.faa"))
+    shell:
+        "sed 's/\*$//' {input} > {output}"
+
+rule rgi_input_metaspades_hybrid:
+    input:
+        os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/metaspades_hybrid/lr_no_barcode-sr_ONT3_MG_xx_Rashi_S11/contigs.faa"))
+    output:
+        temp(os.path.join(RESULTS_DIR, "analysis/rgi/metaspades_hybrid.faa"))
+    shell:
+        "sed 's/\*$//' {input} > {output}"
+
+rule rgi_input_metaspades:
+    input:
+        os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/metaspades/ONT3_MG_xx_Rashi_S11/final.contigs.faa"))
+    output:
+        temp(os.path.join(RESULTS_DIR, "analysis/rgi/metaspades.faa"))
+    shell:
+        "sed 's/\*$//' {input} > {output}"
+
+rule rgi_input_megahit:
+    input:
+        os.path.abspath(os.path.join(RESULTS_DIR, "annotation/proteins/megahit/ONT3_MG_xx_Rashi_S11/final.contigs.faa"))
+    output:
+        temp(os.path.join(RESULTS_DIR, "analysis/rgi/megahit.faa"))
+    shell:
+        "sed 's/\*$//' {input} > {output}"
+
+# RGI DBs
+rule rgi_db:
+    output:
+        archive=temp(os.path.join(DB_DIR, "rgi/card-data.tar.bz2")),
+        json=os.path.join(DB_DIR, "rgi/card.json")
+    log:
+        os.path.join(DB_DIR, "rgi/rgi.log")
+    params:
+        db_url=config["rgi"]["db_url"]
+    message:
+        "Download RGI DB data"
+    shell:
+        "wget -O {output.archive} {params.db_url} --no-check-certificate &> {log} && "
+        "tar -C $(dirname {output.archive}) -xvf {output.archive} &>> {log}"
+
+# Run RGI: proteins
+rule rgi_prot:
+    input:
+        faa=os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.faa"),
+        db=os.path.join(DB_DIR, "rgi/card.json")
+    output:
+        os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.txt")
+    log:
+        os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.log")
+    threads: 10
+    params:
+        alignment_tool=config["rgi"]["alignment_tool"],
+        obname=lambda wildcards, output: os.path.splitext(output[0])[0]
+    conda:
+        os.path.join(ENV_DIR, "rgi.yaml")
+    message:
+        "AMR prediction w/ RGI: {input}"
+    shell:
+        # NOTE: to make sure that the correct DB is used
+        "rgi clean --local &> {log} && "
+        "rgi load --card_json {input.db} --local &>> {log} && "
+        "rgi database --version --local &>> {log} && "
+        # NOTE: https://github.com/arpcard/rgi/issues/93: KeyError: 'snp'
+        # need to run the CMD twice
+        "rgi main --input_sequence {input.faa} --output_file {params.obname} --input_type protein --local -a {params.alignment_tool} --clean -n {threads} &>> {log} || "
+        "rgi main --input_sequence {input.faa} --output_file {params.obname} --input_type protein --local -a {params.alignment_tool} --clean -n {threads} &>> {log}"
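The four `rgi_input_*` rules differ only in the hard-coded protein FASTA path. Once the file-path issue mentioned in the NOTE is resolved, they could collapse into a single wildcard rule; a sketch (not part of the patch; the path dictionary simply restates the four inputs above):

# sketch: one wildcard rule replacing the four rgi_input_* rules
RGI_PROT_FAA = {
    "flye": "annotation/proteins/flye/lr/merged/no_barcode/assembly.faa",
    "metaspades_hybrid": "annotation/proteins/metaspades_hybrid/lr_no_barcode-sr_ONT3_MG_xx_Rashi_S11/contigs.faa",
    "metaspades": "annotation/proteins/metaspades/ONT3_MG_xx_Rashi_S11/final.contigs.faa",
    "megahit": "annotation/proteins/megahit/ONT3_MG_xx_Rashi_S11/final.contigs.faa",
}

rule rgi_input:
    input:
        lambda wildcards: os.path.abspath(os.path.join(RESULTS_DIR, RGI_PROT_FAA[wildcards.assembly]))
    output:
        temp(os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.faa"))
    shell:
        "sed 's/\*$//' {input} > {output}"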
diff --git a/2019_GDB/rules/ASSEMBLY_ANNOTATION_RULES b/2019_GDB/rules/ASSEMBLY_ANNOTATION_RULES
index a3c1c5ad6bfb285ccce509d897491ce28df6fe86..1af624b0792a6756142270085e3bacb822a2eb86 100755
--- a/2019_GDB/rules/ASSEMBLY_ANNOTATION_RULES
+++ b/2019_GDB/rules/ASSEMBLY_ANNOTATION_RULES
@@ -1,26 +1,5 @@
 # For running the ASSEMBLY and ANNOTATION workflow for ONT data
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-RUNS=[config["runs"]["first"],
-     config["runs"]["second"]]
-#     config["runs"]["third"]]
-BARCODES=config["barcodes"]
-SAMPLES=["ONT3_MG_xx_Rashi_S11"]
-sr_sample=["ONT3_MG_xx_Rashi_S11"]
-# SAMPLES_ALL=config["samples"]
-REFERENCES=["igc", "hg38"]
-IGC_URI=config["igc"]["uri"]
-HG38_URI=config["hg38"]["uri"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa_mem", "minimap2"]
-
-
 ######
 # RULES
 ######
diff --git a/2019_GDB/rules/BINNING_RULES b/2019_GDB/rules/BINNING_RULES
index 0b38b5ca3cae6fec3c80ce5339bfe5754e0e6fc2..102adaa745ab032644a8a701eb1072f72da07f95 100755
--- a/2019_GDB/rules/BINNING_RULES
+++ b/2019_GDB/rules/BINNING_RULES
@@ -1,21 +1,5 @@
 # Rules for BINNING workflow, i.e. generating MAGs from assemblies
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mmi"]
-#SAMPLES=config["samples"]
-SAMPLES=["flye", "megahit", "metaspades_hybrid"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
-
 ###################
 # Preparing files #
 ###################
diff --git a/2019_GDB/rules/MAPPING_RULES b/2019_GDB/rules/MAPPING_RULES
index 0827a17e3002a22c9942fee6e9baeee8892a7282..83af0b7785c4dc8334457ca7b95e8c022f86b5a6 100755
--- a/2019_GDB/rules/MAPPING_RULES
+++ b/2019_GDB/rules/MAPPING_RULES
@@ -1,23 +1,5 @@
 # Rule for running the mapping "workflow" for ONT analsyes
 
-#shell.executable("/bin/bash")
-#shell.prefix("source ~/.bashrc; ")
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mmi"]
-#SAMPLES=config["samples"]
-SAMPLES=["flye", "megahit", "metaspades_hybrid"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
-
 #######################
 ## Mapping to hybrid ##
 #######################
diff --git a/2019_GDB/rules/METAT_RULES b/2019_GDB/rules/METAT_RULES
index 636ea21c9649bfa82c31e132d14e56d9b5dca24f..157930ad403f26e60a1bddc302db83d62c1453ea 100755
--- a/2019_GDB/rules/METAT_RULES
+++ b/2019_GDB/rules/METAT_RULES
@@ -1,20 +1,5 @@
 # Rules for mapping the metaT reads to the different assemblies
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mmi"]
-# SAMPLES=config["samples"]
-SAMPLES=["flye", "megathit", "metaspades_hybrid"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
-
 ###########
 ## metaT ##
 ###########
diff --git a/2019_GDB/rules/MMSEQ_RULES b/2019_GDB/rules/MMSEQ_RULES
index 98a5c84caf8b5afc9cad24746e34a3bd642fb61e..93d1c2787a2b345ac73d4a868701ad40cbddd3a8 100755
--- a/2019_GDB/rules/MMSEQ_RULES
+++ b/2019_GDB/rules/MMSEQ_RULES
@@ -1,21 +1,5 @@
 # For running the MMSEQ2 comparison of proteins after assemblies are run through prokka/prodigal
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mm"]
-# SAMPLES=config["samples"]
-SAMPLES=["flye", "megahit", "metaspades_hybrid"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
-
 #############################
 ######## MMSEQ2 ############
 #############################
diff --git a/2019_GDB/rules/PLOT_RULES b/2019_GDB/rules/PLOT_RULES
index fdbb71a6d52dfb2bb6515d45020fe5d7a0cb8823..88b0e1b22f4280e42129d0a19003eaef35089c81 100755
--- a/2019_GDB/rules/PLOT_RULES
+++ b/2019_GDB/rules/PLOT_RULES
@@ -1,29 +1,12 @@
 # For running the MMSEQ2 comparison of proteins after assemblies are run through prokka/prodigal
 
-import os
-#from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-#DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-#DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-#ASSEMBLERS=config["assemblers"]
-#MAPPERS=["bwa", "mm"]
-## SAMPLES=config["samples"]
-#SAMPLES=["flye", "megahit", "metaspades_hybrid"]
-#BINNING_SAMPLES=config["binning_samples"]
-#HYBRID_ASSEMBLER=config["hybrid_assembler"]
-SR_PREFIX="ONT3_MG_xx_Rashi_S11"
-
-
 #############################
 ###### IGC COVERAGE ########
 #############################
 
 rule all:
     input:
-        expand("{results_dir}/plots/genomecov/{sr_prefix}_vs_{lr_prefix}-x-{reference}_coverage.html", results_dir=RESULTS_DIR, sr_prefix=SR_PREFIX, lr_prefix=BARCODES, reference="igc"),
-        expand("{results_dir}/plots/annotation/diamond/lr_{lr_prefix}-sr{sr_prefix}-gene_length_ratio.html", results_dir=RESULTS_DIR, sr_prefix=SR_PREFIX, lr_prefix=BARCODES)
+        expand("{results_dir}/plots/genomecov/{sr_prefix}_vs_{lr_prefix}-x-{reference}_coverage.html", results_dir=RESULTS_DIR, sr_prefix=SAMPLES, lr_prefix=BARCODES, reference="igc"),
+        expand("{results_dir}/plots/annotation/diamond/lr_{lr_prefix}-sr{sr_prefix}-gene_length_ratio.html", results_dir=RESULTS_DIR, sr_prefix=SAMPLES, lr_prefix=BARCODES)
 
 
 rule igc_correlation_plot:
diff --git a/2019_GDB/rules/TAXONOMY_RULES b/2019_GDB/rules/TAXONOMY_RULES
index bdc76d9ea6e1a0525b9cc1f371079a86246c51c0..a10e690b81ec8c02f0d893276200cd2e65a710ca 100755
--- a/2019_GDB/rules/TAXONOMY_RULES
+++ b/2019_GDB/rules/TAXONOMY_RULES
@@ -1,21 +1,5 @@
 # For running the contamination check and taxonomy on the generated MAGs
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mmi"]
-#SAMPLES=config["samples"]
-SAMPLES=["flye", "megahit", "metaspades_hybrid"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
-
 #####################################
 ######## GTDBTK & CHECKM ############
 #####################################
diff --git a/2019_GDB/rules/checkpoint_ASSEMBLY_ANNOTATION_RULES b/2019_GDB/rules/checkpoint_ASSEMBLY_ANNOTATION_RULES
index 24bdf257b4c009cc7dc8701b8a261d38adaa29b3..c33efa4ebcd0263f9f6e37a0fc4d4e12d7fa065a 100755
--- a/2019_GDB/rules/checkpoint_ASSEMBLY_ANNOTATION_RULES
+++ b/2019_GDB/rules/checkpoint_ASSEMBLY_ANNOTATION_RULES
@@ -1,26 +1,5 @@
 # For running the ASSEMBLY and ANNOTATION workflow for ONT data
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-RUNS=[config["runs"]["first"],
-     config["runs"]["second"]]
-#     config["runs"]["third"]]
-BARCODES=config["barcodes"]
-SAMPLES=["ONT3_MG_xx_Rashi_S11"]
-sr_sample=["ONT3_MG_xx_Rashi_S11"]
-# SAMPLES_ALL=config["samples"]
-REFERENCES=["igc", "hg38"]
-IGC_URI=config["igc"]["uri"]
-HG38_URI=config["hg38"]["uri"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa_mem", "minimap2"]
-
-
 ######
 # RULES
 ######
diff --git a/2019_GDB/scripts/filter_fasta_by_length.pl b/2019_GDB/scripts/filter_fasta_by_length.pl
new file mode 100755
index 0000000000000000000000000000000000000000..1643e1bc2b13f4299570475ed8b6e05eb4b9c30f
--- /dev/null
+++ b/2019_GDB/scripts/filter_fasta_by_length.pl
@@ -0,0 +1,21 @@
+#!/usr/bin/env perl
+
+# from https://www.biostars.org/p/79202/#79467
+
+use strict;
+use warnings;
+
+my $minlen = shift or die "Error: `minlen` parameter not provided\n";
+{
+    local $/=">";
+    while(<>) {
+        chomp;
+        next unless /\w/;
+        s/>$//gs;
+        my @chunk = split /\n/;
+        my $header = shift @chunk;
+        my $seqlen = length join "", @chunk;
+        print ">$_" if($seqlen >= $minlen);
+    }
+    local $/="\n";
+}
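The workflow calls the script above as `filter_fasta_by_length.pl <minlen> <fasta> > <filtered.fna>` (see the plasflow_input rule). For reference, an equivalent filter in Python could look like this (a sketch, not part of the repository):

# sketch: Python equivalent of filter_fasta_by_length.pl
import sys

def filter_fasta(minlen, handle, out=sys.stdout):
    header, seq = None, []
    def flush():
        # write the current record only if its total sequence length passes the cutoff
        if header is not None and sum(len(s) for s in seq) >= minlen:
            out.write(header + "\n" + "\n".join(seq) + "\n")
    for line in handle:
        line = line.rstrip("\n")
        if line.startswith(">"):
            flush()
            header, seq = line, []
        elif line:
            seq.append(line)
    flush()

if __name__ == "__main__":
    with open(sys.argv[2]) as fh:
        filter_fasta(int(sys.argv[1]), fh)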
diff --git a/2019_GDB/updated_SNAKEFILE b/2019_GDB/updated_SNAKEFILE
index 7290cb83f562164795ef3eacfc9bdd2f00f4a4f3..26e428dc5c38e225945172bb35e332c2e0aa64d3 100755
--- a/2019_GDB/updated_SNAKEFILE
+++ b/2019_GDB/updated_SNAKEFILE
@@ -1,14 +1,51 @@
 # File for running ONT analyses
-# default configuration file
+##############################
+# MODULES
+import os
+from tempfile import TemporaryDirectory
+
+##############################
+# CONFIG
 configfile:"config/CONFIG.yaml"
+# Paths
+SRC_DIR = srcdir("scripts")
+ENV_DIR = srcdir("envs")
+DATA_DIR = config["data_dir"]
+RESULTS_DIR = config["results_dir"]
+DB_DIR = config["db_dir"]
+
+# Steps
+STEPS = config['steps']
+ANALYSIS_STEPS = config["analysis_steps"]
+
+# Input
+BARCODES = config["barcodes"]
+RUNS = [
+    config["runs"]["first"],
+    config["runs"]["second"],
+#    config["runs"]["third"]
+]
+SAMPLES = config["samples"]
+BINNING_SAMPLES = config["binning_samples"]
+
+# References
+IGC_URI = config["igc"]["uri"]
+HG38_URI = config["hg38"]["uri"]
+REFERENCES = ["igc", "hg38"]
+
+# Tools
+ASSEMBLERS = config["assemblers"]
+HYBRID_ASSEMBLER = config["hybrid_assembler"]
+MAPPERS = ["bwa", "mmi"]
+
 # default executable for snakmake
 shell.executable("bash")
 
-# input settings
-RUNS=config['runs']['first']
-STEPS=config['steps']
+# working directory
+workdir:
+    config["work_dir"]
 
 # include rules for the workflows based on "steps" in the CONFIG.yaml file
 # ONT analyses workflow
@@ -56,9 +93,21 @@ if 'taxonomy' in STEPS:
 
 if 'analysis' in STEPS:
     include: "workflows/analysis.smk"
-    TARGETS += ["cdhit_analysis.done",
-                "mappability_index.done",
-                "crispr_analysis.done"]
+    # CD-HIT
+    if "cdhit" in ANALYSIS_STEPS:
+        TARGETS.append("cdhit_analysis.done")
+    # Mappability
+    if "mappability" in ANALYSIS_STEPS:
+        TARGETS.append("mappability_index.done")
+    # CRISPR
+    if "crispr" in ANALYSIS_STEPS:
+        TARGETS.append("crispr_analysis.done")
+    # Plasmid prediction
+    if "plasmids" in ANALYSIS_STEPS:
+        TARGETS.append("plasmids_analysis.done")
+    # AMR prediction
+    if "amr" in ANALYSIS_STEPS:
+        TARGETS.append("amr_analysis.done")
 
 #else:
 #    raise Exception('You are not serious. No input data')
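The per-step `if ... in ANALYSIS_STEPS` branches above could also be written as a single lookup, which keeps the step-name-to-target mapping in one place; a sketch only, with behaviour identical to the explicit branches:

# sketch: table-driven construction of the analysis targets
ANALYSIS_TARGETS = {
    "cdhit": "cdhit_analysis.done",
    "mappability": "mappability_index.done",
    "crispr": "crispr_analysis.done",
    "plasmids": "plasmids_analysis.done",
    "amr": "amr_analysis.done",
}
TARGETS += [ANALYSIS_TARGETS[step] for step in ANALYSIS_STEPS if step in ANALYSIS_TARGETS]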
diff --git a/2019_GDB/workflows/analysis.smk b/2019_GDB/workflows/analysis.smk
index 690ebb537a185dfa3f6f07095135e898197f5141..064d6ba49dcfaa7968b3e702f0f176d681996b52 100755
--- a/2019_GDB/workflows/analysis.smk
+++ b/2019_GDB/workflows/analysis.smk
@@ -1,21 +1,5 @@
 # For running the ANALYSIS "workflow"
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-#ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mmi"]
-# SAMPLES=config["samples"]
-#SAMPLES=["flye", "megahit", "metaspades_hybrid"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
-
 # specify which rules to run
 include: '../rules/ANALYSIS_RULES'
 
@@ -44,3 +28,15 @@ rule CRISPR:
         expand(os.path.join(RESULTS_DIR, "analysis/crispr/minced/{assembly}.txt"), assembly=["flye", "megahit", "metaspades", "metaspades_hybrid"])
     output:
         touch('crispr_analysis.done')
+
+rule PLASMIDS:
+    input:
+        expand(os.path.join(RESULTS_DIR, "analysis/plasflow/{assembly}.tsv"), assembly=ASSEMBLERS)
+    output:
+        touch('plasmids_analysis.done')
+
+rule AMR:
+    input:
+        expand(os.path.join(RESULTS_DIR, "analysis/rgi/{assembly}.txt"), assembly=ASSEMBLERS)
+    output:
+        touch('amr_analysis.done')
diff --git a/2019_GDB/workflows/assembly_annotation.smk b/2019_GDB/workflows/assembly_annotation.smk
index 51cd000b216a52a1f5a1771d17f69e9a17248b6a..f26ad8f0b3442f60d6cc65469a526d036add908f 100755
--- a/2019_GDB/workflows/assembly_annotation.smk
+++ b/2019_GDB/workflows/assembly_annotation.smk
@@ -1,25 +1,5 @@
 # For running the ASSEMBLY and ANNOTATION part of the ONT pipeline
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-RUNS=[config["runs"]["first"],
-     config["runs"]["second"]]
-#     config["runs"]["third"]]
-##RUNS=config["runs"]["third"]
-BARCODES=config["barcodes"]
-SAMPLES=["ONT3_MG_xx_Rashi_S11"]
-# SAMPLES_ALL=config["samples"]
-REFERENCES=["igc", "hg38"]
-IGC_URI=config["igc"]["uri"]
-HG38_URI=config["hg38"]["uri"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa_mem", "minimap2"]
-
 # specify which rules to run
 include: '../rules/ASSEMBLY_ANNOTATION_RULES'
 
@@ -46,8 +26,8 @@ rule BASECALL_MERGE_QC:
         expand(os.path.join(RESULTS_DIR, "qc/lr/merged/{barcode}/{barcode}NanoStats.txt"), barcode=BARCODES),
         expand(os.path.join(RESULTS_DIR, "qc/lr/{run}/{barcode}/{barcode}NanoStats.txt"), run=RUNS, barcode=BARCODES),
 #        expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}_R1_001.fastp.fq.gz"), sr_sample=["ONT3_MG_xx_Rashi_S11"]),
-        expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}.fastp.{report_type}"), sr_sample="ONT3_MG_xx_Rashi_S11", report_type=["html", "json"]),
-        expand(os.path.join(RESULTS_DIR, "qc/sr/fastqc/{sr_sample}/{sr_sample}_R{orientation}_001.fastp.fastqc.html"), sr_sample="ONT3_MG_xx_Rashi_S11", orientation=["1", "2"])
+        expand(os.path.join(RESULTS_DIR, "preprocessing/sr/{sr_sample}.fastp.{report_type}"), sr_sample=SAMPLES, report_type=["html", "json"]),
+        expand(os.path.join(RESULTS_DIR, "qc/sr/fastqc/{sr_sample}/{sr_sample}_R{orientation}_001.fastp.fastqc.html"), sr_sample=SAMPLES, orientation=["1", "2"])
     output:
         "basecall_merge_qc.done"
     shell: """
@@ -67,8 +47,8 @@ rule BASECALL_MERGE_QC_NO_MOD:
 
 rule COVERAGE_OF_REFERENCES:
     input:
-        expand(os.path.join(RESULTS_DIR, "genomecov/sr/minimap2/{sr_sample}-x-{reference}.avg_cov.txt"), sr_sample="ONT3_MG_xx_Rashi_S11", reference=REFERENCES),
-        expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sr_sample}-x-{reference}.avg_cov.txt"), sr_sample="ONT3_MG_xx_Rashi_S11", reference=REFERENCES),
+        expand(os.path.join(RESULTS_DIR, "genomecov/sr/minimap2/{sr_sample}-x-{reference}.avg_cov.txt"), sr_sample=SAMPLES, reference=REFERENCES),
+        expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sr_sample}-x-{reference}.avg_cov.txt"), sr_sample=SAMPLES, reference=REFERENCES),
         expand(os.path.join(RESULTS_DIR, "genomecov/lr/merged/{barcode}/{barcode}-x-{reference}.avg_cov.txt"), barcode=BARCODES, reference=REFERENCES)
     output:
         "coverage_of_references.done"
     shell:
@@ -123,11 +103,11 @@ rule ASSEMBLE_AND_COVERAGE:
         #expand(os.path.join(RESULTS_DIR, "genomecov/lr/bwa_mem/{barcode}_reads-x-lr_{barcode}_sr_{sample}-{assembler}_contigs.avg_cov.txt"), barcode=BARCODES, sample=SAMPLES, assembler="operams"),
         expand(os.path.join(RESULTS_DIR, "assembly/flye/lr/merged/{barcode}/assembly.fna"), barcode=BARCODES),
         # short reads on short read contigs
-        expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sr_sample}_reads-x-{sr_sample}-{assembler}_contigs.avg_cov.txt"), sr_sample="ONT3_MG_xx_Rashi_S11", assembler=["megahit", "metaspades"]),
+        expand(os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sr_sample}_reads-x-{sr_sample}-{assembler}_contigs.avg_cov.txt"), sr_sample=SAMPLES, assembler=["megahit", "metaspades"]),
         # assemble short reads with metaspades
-        # expand(os.path.join(RESULTS_DIR, "assembly/metaspades/{sr_sample}/contigs.fna"), sr_sample="ONT3_MG_xx_Rashi_S11"),
+        # expand(os.path.join(RESULTS_DIR, "assembly/metaspades/{sr_sample}/contigs.fna"), sr_sample=SAMPLES),
         # long reads on short read contigs
-        expand(os.path.join(RESULTS_DIR, "genomecov/lr/bwa_mem/{barcode}_reads-x-{sr_sample}-{assembler}_contigs.avg_cov.txt"), barcode=BARCODES, sr_sample="ONT3_MG_xx_Rashi_S11", assembler="megahit")
+        expand(os.path.join(RESULTS_DIR, "genomecov/lr/bwa_mem/{barcode}_reads-x-{sr_sample}-{assembler}_contigs.avg_cov.txt"), barcode=BARCODES, sr_sample=SAMPLES, assembler="megahit")
     output:
         "assemble_and_coverage.done"
     shell: """
diff --git a/2019_GDB/workflows/binning.smk b/2019_GDB/workflows/binning.smk
index d94542207bf225f5d493fa3a4a4d1e78cda72fd5..0ebf77a44b05cbcd1dc0f5421146f0a5c67d575e 100755
--- a/2019_GDB/workflows/binning.smk
+++ b/2019_GDB/workflows/binning.smk
@@ -1,21 +1,5 @@
 # For running the BINNING "workflow"
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mmi"]
-#SAMPLES=config["samples"]
-SAMPLES=["flye", "megahit", "metaspades_hybrid"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
-
 # specify which rules to run
 include: '../rules/BINNING_RULES'
diff --git a/2019_GDB/workflows/checkpoint_assembly_annotation.smk b/2019_GDB/workflows/checkpoint_assembly_annotation.smk
index 3e43d008e09d33697493a568d0cc9ac4e0a3229c..0e23f32b93e29da637220463b5e184b15fa3bafc 100755
--- a/2019_GDB/workflows/checkpoint_assembly_annotation.smk
+++ b/2019_GDB/workflows/checkpoint_assembly_annotation.smk
@@ -1,25 +1,5 @@
 # For running the ASSEMBLY and ANNOTATION part of the ONT pipeline
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-RUNS=[config["runs"]["first"],
-     config["runs"]["second"]]
-#     config["runs"]["third"]]
-##RUNS=config["runs"]["third"]
-BARCODES=config["barcodes"]
-SAMPLES=["ONT3_MG_xx_Rashi_S11"]
-# SAMPLES_ALL=config["samples"]
-REFERENCES=["igc", "hg38"]
-IGC_URI=config["igc"]["uri"]
-HG38_URI=config["hg38"]["uri"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa_mem", "minimap2"]
-
 # specify which rules to run
 include: '../rules/checkpoint_ASSEMBLY_ANNOTATION_RULES'
diff --git a/2019_GDB/workflows/mapping.smk b/2019_GDB/workflows/mapping.smk
index d6f22292e7015f580e950836429b73e6c0d0be8b..8b9716973b3b1c9819f512f67de27ad23672149c 100755
--- a/2019_GDB/workflows/mapping.smk
+++ b/2019_GDB/workflows/mapping.smk
@@ -1,22 +1,5 @@
 # Workflow for running mapping steps of different assemblers, and mappers for the Binning workflow
 
-#shell.executable("/bin/bash")
-##shell.prefix("source ~/.bashrc; ")
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mmi"]
-#SAMPLES=config["samples"]
-SAMPLES=["flye", "megahit", "metaspades_hybrid"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
 # specify which rules to run
 include: '../rules/MAPPING_RULES'
diff --git a/2019_GDB/workflows/metat.smk b/2019_GDB/workflows/metat.smk
index a19965eba9c462fbbae63ce4ea01d9ba0b2c8ddd..4854a12e4b563ec54245b3e0e52bfe611c3fb36d 100755
--- a/2019_GDB/workflows/metat.smk
+++ b/2019_GDB/workflows/metat.smk
@@ -1,20 +1,5 @@
 # Workflow for running mapping metaT reads to different assemblies using different mappers for the Binning workflow
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mmi"]
-#Â SAMPLES=config["samples"]
-SAMPLES=["flye", "megahit", "metaspades_hybrid"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
 # specify which rules to run
 include: '../rules/METAT_RULES'
diff --git a/2019_GDB/workflows/mmseq.smk b/2019_GDB/workflows/mmseq.smk
index 2ba8f8319a62aaf682ff9bc92d22b12a4fcc54e4..272a754fd6a94be274e360e79a11e87d10228bfc 100755
--- a/2019_GDB/workflows/mmseq.smk
+++ b/2019_GDB/workflows/mmseq.smk
@@ -1,21 +1,5 @@
 # For running the MMSEQ2 "workflow"
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mmi"]
-# SAMPLES=config["samples"]
-SAMPLES=["flye", "megahit", "metaspades_hybrid"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
-
 # specify which rules to run
 include: '../rules/MMSEQ_RULES'
 
@@ -23,7 +7,7 @@ include:
 # Rule all for running the MMSEQ2 analyses on the proteins after assembly
 rule MMSEQ:
     input:
-        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler=["flye", "megahit", "metaspades_hybrid", "metaspades"]),
+        expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/{assembler}_db"), assembler=ASSEMBLERS),
         expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit_rbh")),
         expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid_rbh")),
         expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid_rbh")),
diff --git a/2019_GDB/workflows/taxonomy.smk b/2019_GDB/workflows/taxonomy.smk
index 733d1ef2433b6e555a7cb4caca5268884339ca7a..6b9e5dde5c7e0b9d7acf945e4e519a76cf3d799d 100755
--- a/2019_GDB/workflows/taxonomy.smk
+++ b/2019_GDB/workflows/taxonomy.smk
@@ -1,21 +1,5 @@
 # For running the TAXONOMY "workflow"
 
-import os
-from tempfile import TemporaryDirectory
-
-configfile: "config/CONFIG.yaml"
-DATA_DIR = config["data_dir"]
-RESULTS_DIR = config["results_dir"]
-DB_DIR=config["db_dir"]
-BARCODES=config["barcodes"]
-ASSEMBLERS=config["assemblers"]
-MAPPERS=["bwa", "mmi"]
-#SAMPLES=config["samples"]
-SAMPLES=["flye", "megahit", "metaspades_hybrid"]
-BINNING_SAMPLES=config["binning_samples"]
-HYBRID_ASSEMBLER=config["hybrid_assembler"]
-
-
 #####################################
 ######## GTDBTK & CHECKM ############
 #####################################
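With the expanded `assemblers` list in CONFIG.yaml, the new PLASMIDS and AMR aggregation rules in analysis.smk request one PlasFlow table and one RGI report per assembler. Spelled out, the expansions resolve to the following paths relative to `results_dir` (a sketch of the expected targets, not additional workflow code):

# sketch: what the PLASMIDS/AMR inputs expand to with the default CONFIG.yaml
ASSEMBLERS = ["flye", "megahit", "metaspades", "metaspades_hybrid"]
plasmid_tables = [f"analysis/plasflow/{a}.tsv" for a in ASSEMBLERS]
amr_reports = [f"analysis/rgi/{a}.txt" for a in ASSEMBLERS]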