Skip to content
Snippets Groups Projects
Commit e08b0f30 authored by Valentina Galata's avatar Valentina Galata
Browse files

added mash analysis (issue #58)

parent 334acc25
No related branches found
No related tags found
1 merge request!76Merge "cleanup" branch with "master" branch
......@@ -212,6 +212,13 @@ kraken2:
# GTDBTK:
# DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
##############################
# MISC
# https://github.com/marbl/mash
mash:
threads: 10
##############################
# Binning
......
......@@ -3,7 +3,7 @@
# Pipeline steps to be done
# steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
steps: ["taxonomy"]
steps: ["analysis"]
############################################################
# INPUT
......@@ -212,6 +212,13 @@ kraken2:
# GTDBTK:
# DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
##############################
# MISC
# https://github.com/marbl/mash
mash:
threads: 10
##############################
# Binning
......
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=1_llvm
- capnproto=0.6.1=hfc679d8_1
- gsl=2.6=h294904e_0
- libblas=3.8.0=17_openblas
- libcblas=3.8.0=17_openblas
- libgcc-ng=9.2.0=h24d8f2e_2
- libgfortran-ng=7.5.0=hdf63c60_6
- libopenblas=0.3.10=h5ec1e0e_0
- libstdcxx-ng=9.2.0=hdf63c60_2
- llvm-openmp=10.0.0=hc9558a2_0
- mash=2.2.2=ha61e061_2
- zlib=1.2.11=h516909a_1006
......@@ -168,6 +168,72 @@ rule circ_contigs_filter:
df_circ.to_csv(output.circ, sep="\t", header=True, index=False, index_label=False)
df_compl.to_csv(output.compl, sep="\t", header=True, index=False, index_label=False)
##################################################
# Mash
rule mash_contigs_process:
input:
os.path.join(RESULTS_DIR, "assembly/{rtype}/{tool}/ASSEMBLY.fasta")
output:
temp(os.path.join(RESULTS_DIR, "analysis/mash/{rtype}_{tool}.fasta"))
wildcard_constraints:
rtype="|".join(READ_TYPES),
tool="|".join(ASSEMBLERS)
params:
minlen=1000
conda:
os.path.join(ENV_DIR, "rgi.yaml") # NOTE: need an env. containing Biopython
message:
"Process contigs for mash"
script:
os.path.join(SRC_DIR, "mash_contigs.py")
rule mash_contigs:
input:
lr=expand(os.path.join(RESULTS_DIR, "analysis/mash/lr_{tool}.fasta"), tool=config["assemblers"]["lr"]),
sr=expand(os.path.join(RESULTS_DIR, "analysis/mash/sr_{tool}.fasta"), tool=config["assemblers"]["sr"]),
hy=expand(os.path.join(RESULTS_DIR, "analysis/mash/hy_{tool}.fasta"), tool=config["assemblers"]["hy"])
output:
os.path.join(RESULTS_DIR, "analysis/mash/contigs.fasta")
message:
"Cat contigs for mash"
shell:
"cat {input} > {output}"
rule mash_sketch:
input:
os.path.join(RESULTS_DIR, "analysis/mash/contigs.fasta")
output:
os.path.join(RESULTS_DIR, "analysis/mash/contigs.msh")
log:
out="logs/mash.sketch.contigs.out.log",
err="logs/mash.sketch.contigs.err.log"
threads:
1
conda:
os.path.join(ENV_DIR, "mash.yaml")
shell:
"(date && "
"ofile={output} && mash sketch -i -k 31 -s 1000 -S 42 -o ${{ofile%.*}} {input} && "
"date) 2> {log.err} > {log.out}"
rule mash_dist:
input:
os.path.join(RESULTS_DIR, "analysis/mash/contigs.msh")
output:
os.path.join(RESULTS_DIR, "analysis/mash/contigs.dist")
log:
out="logs/mash.dist.contigs.out.log",
err="logs/mash.dist.contigs.err.log"
threads:
config["mash"]["threads"]
conda:
os.path.join(ENV_DIR, "mash.yaml")
shell:
"(date && "
"mash dist -p {threads} {input} {input} > {output} && "
"date) 2> {log.err} > {log.out}"
##################################################
# MMseqs2
......
#!/usr/bin/env python
from Bio import SeqIO
with open(snakemake.input[0], "r") as ifile, open(snakemake.output[0], "w") as ofile:
for record in SeqIO.parse(ifile, "fasta"):
if len(record) >= snakemake.params.minlen:
record.id = "%s__%s__%s" % (record.id, snakemake.wildcards.rtype, snakemake.wildcards.tool)
SeqIO.write(record, ofile, "fasta")
......@@ -10,7 +10,8 @@ rule ANALYSIS:
input:
"status/analysis_assembly.done",
"status/analysis_proteins.done",
"status/analysis_circ.done"
"status/analysis_circ.done",
"status/analysis_mash.done"
output:
touch("status/analysis.done")
......@@ -44,4 +45,10 @@ rule ANALYSIS_CIRC:
ctype=["circ", "compl"]
)
output:
touch("status/analysis_circ.done")
\ No newline at end of file
touch("status/analysis_circ.done")
rule ANALYSIS_MASH:
input:
os.path.join(RESULTS_DIR, "analysis/mash/contigs.dist")
output:
touch("status/analysis_mash.done")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment