ESB / ONT_pilot_gitlab · Merge request !78 (merged): Add zymo workflow
Valentina Galata requested to merge zymo into bright_future 4 years ago
(39 commits, 1 changed file)

Commit 1b38a514: "zymo: updated snakemake (gene summary, issue #111)"
File: workflow_zymo/Snakefile (new file, +530 −0)
# Pipeline for extra analysis of the zymo data:
# to be used after the main workflow (i.e. workflow/) and the report workflow (i.e. workflow_report/)
#
# Example call:
# snakemake -s workflow_zymo/Snakefile --configfile config/zymo/config.yaml --use-conda --conda-prefix /scratch/users/vgalata/miniconda3/ONT_pilot --cores 1 -rpn
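#
# Expected config structure (a hedged sketch; key names are inferred from the
# rules below, all values are placeholders for the real config/zymo/config.yaml):
#   ref_genomes: {<org>: <FASTA URL>, ...}
#   diamond: {threads: <int>}
#   hmm: {kegg: <HMM file name under DB_DIR>}
#   kraken2: {db: {<db name>: <path>, ...}}
#   kaiju: {db: {<db name>: <path>, ...}}
# Globals such as RESULTS_DIR, DB_DIR, ENV_DIR, SRC_DIR, READ_TYPES, ASSEMBLERS,
# READ_ASSEMBLERS and META_TYPES are presumably defined in rules/init.smk.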
##############################
# MODULES
import os
import re
import pandas
from scripts.utils import assembler_pairs
##############################
# CONFIG
include: "rules/init.smk"
##################################################
# RULES
##################################################
rule all:
    input:
        refs=os.path.join(RESULTS_DIR, "ref_genomes", "zymo.dmnd"),
        kegg=os.path.join(DB_DIR, "kegg.tsv"),
        mummer=expand(
            os.path.join(RESULTS_DIR, "extra/zymo/mummer/{rtype_tool}.dnadiff.report"),
            rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
        ),
        diamond=expand(
            os.path.join(RESULTS_DIR, "extra/zymo/diamond/{rtype_tool}.tsv"),
            rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
        ),
        summary=expand(
            os.path.join(RESULTS_DIR, "extra/{rtype_tool}.genes.tsv"),
            rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
        ),
        summary_pdf=expand(
            os.path.join(RESULTS_DIR, "extra/{rtype_tool}.genes.pdf"),
            rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
        )
#####
# expand(os.path.join(RESULTS_DIR, "analysis/sibeliaz/{combi}_output/blocks_coords.gff"),
# combi=["%s_%s__%s_%s" % (p[0][0], p[0][1], p[1][0], p[1][1]) for p in READ_ASSEMBLER_PAIRS]),
# expand(os.path.join(RESULTS_DIR, "analysis/zymo_sibeliaz/zymo_{rtype_tool}_output/blocks_coords.gff"),
# rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS])
rule zymo_download:
    output:
        os.path.join(RESULTS_DIR, "ref_genomes", "{org}.fna")
    log:
        "logs/zymo.download.{org}.log"
    params:
        url=lambda wildcards: config["ref_genomes"][wildcards.org]
    message:
        "Zymo: download reference: {wildcards.org}"
    shell:
        "(date && wget {params.url} -O {output} && date) &> {log}"
rule zymo_prodigal:
    input:
        rules.zymo_download.output
    output:
        faa=os.path.join(RESULTS_DIR, "ref_genomes", "{org}.faa"),
        gbk=os.path.join(RESULTS_DIR, "ref_genomes", "{org}.gbk")
    log:
        "logs/zymo.prodigal.{org}.log"
    conda:
        os.path.join(ENV_DIR, "prodigal.yaml")
    message:
        "Zymo: predict proteins: {wildcards.org}"
    shell:
        "(date && prodigal -i {input} -o {output.gbk} -a {output.faa} && sed -i 's/^>/>{wildcards.org}__/' {output.faa} && date) &> {log}"
rule zymo_concat_fna:
    input:
        expand(os.path.join(RESULTS_DIR, "ref_genomes", "{org}.fna"), org=config["ref_genomes"].keys())
    output:
        os.path.join(RESULTS_DIR, "ref_genomes", "zymo.fna")
    message:
        "Zymo: concat references"
    shell:
        "for f in {input}; do sed \"s/^>/>$(basename -s '.fna' ${{f}})__/\" ${{f}}; done > {output}"
rule zymo_concat_faa:
    input:
        expand(os.path.join(RESULTS_DIR, "ref_genomes", "{org}.faa"), org=config["ref_genomes"].keys())
    output:
        os.path.join(RESULTS_DIR, "ref_genomes", "zymo.faa")
    message:
        "Zymo: concat reference proteins"
    shell:
        "cat {input} > {output}"
rule zymo_diamondDB:
    input:
        rules.zymo_concat_faa.output
    output:
        os.path.join(RESULTS_DIR, "ref_genomes", "zymo.dmnd")
    log:
        "logs/zymo.diamondDB.log"
    threads:
        config["diamond"]["threads"]
    conda:
        os.path.join(ENV_DIR, "diamond.yaml")
    message:
        "Zymo: reference DIAMOND DB"
    shell:
        "(date && db={output} && diamond makedb --in {input} --db ${{db%.*}} --threads {threads} && date) &> {log}"
rule zymo_diamond:
    input:
        faa=os.path.join(RESULTS_DIR, "annotation/prodigal/{rtype}/{tool}/proteins.faa"),
        db=rules.zymo_diamondDB.output
    output:
        daa=os.path.join(RESULTS_DIR, "extra/zymo/diamond/{rtype}_{tool}.daa"),
        tsv=os.path.join(RESULTS_DIR, "extra/zymo/diamond/{rtype}_{tool}.tsv")
    log:
        "logs/zymo.diamond.{rtype}.{tool}.log"
    wildcard_constraints:
        rtype="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
    threads:
        config["diamond"]["threads"]
    params:
        blastp="--id 50 --query-cover 50 --subject-cover 50 --more-sensitive",
        outfmt="6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
    conda:
        os.path.join(ENV_DIR, "diamond.yaml")
    message:
        "Zymo: DIAMOND search: {input}"
    shell:
        "(date && "
        "db={input.db} && "
        "daa={output.daa} && "
        "diamond blastp -q {input.faa} --db ${{db%.*}} --out ${{daa}} --threads {threads} --outfmt 100 {params.blastp} && "
        # --max-target-seqs 1: keep only the best match passing the e-value threshold
        "diamond view --daa ${{daa%.*}} --max-target-seqs 1 --threads {threads} --outfmt {params.outfmt} --out {output.tsv} && "
        "date) &> {log}"
rule zymo_mummer_dnadiff:
    input:
        ref=rules.zymo_concat_fna.output,
        asm=os.path.join(RESULTS_DIR, "assembly/{rtype}/{tool}/ASSEMBLY.POLISHED.fasta")
    output:
        out=os.path.join(RESULTS_DIR, "extra/zymo/mummer/{rtype}_{tool}.dnadiff.report")
    log:
        "logs/zymo.mummer.dnadiff.{rtype}.{tool}.log"
    wildcard_constraints:
        rtype="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
    threads: 1
    conda:
        os.path.join(ENV_DIR, "mummer.yaml")
    message:
        "Zymo: MUMMER: dnadiff: {input}"
    shell:
        "(date && "
        "pout={output.out} && "
        "dnadiff -p ${{pout%.*}} {input.ref} {input.asm} && "
        "date) &> {log}"
rule proc_hmmer_tblout:
    input:
        os.path.join(RESULTS_DIR, "annotation/hmm/kegg/{rtype}/{tool}/hmm.tblout")
    output:
        temp(os.path.join(RESULTS_DIR, "annotation/hmm/kegg/{rtype}/{tool}/hmm.tsv"))
    shell:
        "grep -v '^#' {input} | sed 's/\s\+/\\t/g' | cut -f1-18 > {output}"
rule kegg_ko_ids:
    input:
        os.path.join(DB_DIR, config["hmm"]["kegg"])
    output:
        os.path.join(DB_DIR, "kegg.ids")
    shell:
        "grep \"^NAME\" {input} | sed 's/^NAME\s\+//' > {output}"
rule kegg_api_ko:
    input:
        os.path.join(DB_DIR, "kegg.ids")
    output:
        os.path.join(DB_DIR, "kegg.tsv")
    run:
        def query_ko(kos, concat=False):
            # TODO: move to utils.py
            # https://www.kegg.jp/kegg/rest/keggapi.html
            # GET: the input is limited to at most 10 entries
            import re
            import requests
            from time import sleep
            from random import random
            assert 1 <= len(kos) <= 10, "Number of KOs must be between 1 and 10, not {}".format(len(kos))
            for ko in kos:
                assert re.match("K", ko), f"KO does not start with a K: {ko}"
            kegg_info = dict()
            for ko in kos:
                kegg_info[ko] = {"kegg_acc": ko, "kegg_name": "NA", "kegg_def": "NA", "kegg_pathways": set()}
            s = requests.Session()
            s_get = s.get("http://rest.kegg.jp/get/{kos}".format(kos="+".join(kos)))
            assert s_get.status_code == 200, f"Could not get data: HTTP {s_get.status_code}"
            s_pws = s.get("http://rest.kegg.jp/link/pathway/{kos}".format(kos="+".join(kos)))
            assert s_pws.status_code == 200, f"Could not get linked pathways: HTTP {s_pws.status_code}"
            ko = None
            for line in s_get.text.splitlines():
                line = line.strip()
                # if re.match("///", line):  # new entry
                #     ko = None
                if re.match("ENTRY", line):
                    ko = re.sub("\s+", "\t", line).split("\t")[1]
                    assert ko in kegg_info
                if re.match("NAME", line):
                    assert kegg_info[ko]["kegg_name"] == "NA"
                    kegg_info[ko]["kegg_name"] = re.sub("^NAME\s+", "", line)
                elif re.match("DEFINITION", line):
                    assert kegg_info[ko]["kegg_def"] == "NA"
                    kegg_info[ko]["kegg_def"] = re.sub("^DEFINITION\s+", "", line)
            for line in s_pws.text.split("\n"):
                if line and re.search("path:map", line):
                    ko, pathway = line.split("\t")
                    ko = re.sub("^ko:", "", ko)
                    assert ko in kegg_info
                    kegg_info[ko]["kegg_pathways"].add(pathway)
            if concat:
                for ko in kos:
                    kegg_info[ko]["kegg_pathways"] = ";".join(list(kegg_info[ko]["kegg_pathways"])) if len(kegg_info[ko]["kegg_pathways"]) > 0 else "NA"
            # sleep for at least 1 sec between requests to be polite to the API
            sleep(1 + random())
            return kegg_info
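        # Hypothetical usage sketch (KO ids are placeholders, output abridged):
        #   query_ko(["K00001", "K00002"], concat=True)
        #   -> {"K00001": {"kegg_acc": "K00001", "kegg_name": "...",
        #                  "kegg_def": "...", "kegg_pathways": "path:map00010;..."}, ...}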
        with open(input[0], "r") as ifile, open(output[0], "w") as ofile:
            header = ["kegg_acc", "kegg_name", "kegg_def", "kegg_pathways"]
            ofile.write("\t".join(header) + "\n")
            kos = []
            for line in ifile:
                # flush a full batch of 10 KOs (the KEGG API limit per GET request)
                if len(kos) == 10:
                    q = query_ko(kos=kos, concat=True)
                    for v in q.values():
                        ofile.write("\t".join([v[k] for k in header]) + "\n")
                        ofile.flush()
                    kos = []
                ko = line.strip().split("_")[0]
                kos.append(ko)
            # query the last, possibly incomplete batch
            if len(kos) > 0:
                q = query_ko(kos=kos, concat=True)
                for v in q.values():
                    ofile.write("\t".join([v[k] for k in header]) + "\n")
                    ofile.flush()
rule assembly_gene_summary:
    input:
        # FASTAs
        contigs=os.path.join(RESULTS_DIR, "assembly/{rtype}/{tool}/ASSEMBLY.POLISHED.fasta"),
        proteins=os.path.join(RESULTS_DIR, "annotation/prodigal/{rtype}/{tool}/proteins.faa"),
        # taxonomy
        kraken2=expand(
            os.path.join(RESULTS_DIR, "taxonomy/kraken2/{{rtype}}.{{tool}}.{db}.labels.txt"),
            db=config["kraken2"]["db"].keys()
        ),
        kaiju=expand(
            os.path.join(RESULTS_DIR, "taxonomy/kaiju/{{rtype}}.{{tool}}.{db}.tsv"),
            db=config["kaiju"]["db"].keys()
        ),
        # coverage
        genecov=expand(
            os.path.join(RESULTS_DIR, "mapping/{mtype}/{{rtype}}/{{tool}}/ASSEMBLY.POLISHED.sr.cov.pergene"),
            mtype=META_TYPES
        ),
        # reference
        refhits=os.path.join(RESULTS_DIR, "extra/zymo/diamond/{rtype}_{tool}.tsv"),
        # unique/matched proteins
        cdhit=lambda wildcards: expand(
            os.path.join(RESULTS_DIR, "analysis/cdhit/{rtype1_tool1}__{{rtype}}_{{tool}}.faa"),
            rtype1_tool1=["%s_%s" % (r, t) for r, t in READ_ASSEMBLERS if r != wildcards.rtype or t != wildcards.tool]
        ),
        diamond=lambda wildcards: expand(
            os.path.join(RESULTS_DIR, "analysis/diamond/{rtype1_tool1}__{{rtype}}_{{tool}}.tsv"),
            rtype1_tool1=["%s_%s" % (r, t) for r, t in READ_ASSEMBLERS if r != wildcards.rtype or t != wildcards.tool]
        ),
        kegg=os.path.join(RESULTS_DIR, "annotation/hmm/kegg/{rtype}/{tool}/hmm.tsv"),
        kegg_info=os.path.join(DB_DIR, "kegg.tsv")
    output:
        os.path.join(RESULTS_DIR, "extra/{rtype}_{tool}.genes.tsv")
    wildcard_constraints:
        rtype="|".join(READ_TYPES),
        tool="|".join(ASSEMBLERS)
    message:
        "Zymo: Gene summary: {wildcards.rtype}, {wildcards.tool}"
    run:
        from scripts.utils import parse_diamond_tsv, parse_prodigal_faa_headers

        def parse_assembly_fna(ifile):
            # TODO: move to utils.py
            import re
            import pandas
            contigs = {}
            with open(ifile, "r") as fh:
                cid = None
                for line in fh:
                    line = line.strip()
                    if re.match(">", line):
                        cid = re.sub("^>", "", line.split(" ")[0])
                        assert cid not in contigs
                        contigs[cid] = {"length": 0}
                    else:
                        contigs[cid]["length"] += len(line)
            contigs = pandas.DataFrame.from_dict(contigs, orient='index')
            return contigs
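        # parse_assembly_fna returns a DataFrame indexed by contig id with a single
        # "length" column (sequence length in bp), e.g. a row "contig_1 -> 412301";
        # the example value is hypothetical.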
        def parse_kaiju_labels(ifile, columns=["class_flag", "seq_id", "tax_id", "best_match_length", "best_match_tax_ids", "best_match_accs", "best_match_seq", "tax_lineage"]):
            # TODO: move to utils.py
            # https://github.com/bioinformatics-centre/kaiju#output-format
            # either C or U, indicating whether the read is classified or unclassified
            # name of the read
            # NCBI taxon identifier of the assigned taxon
            # the length or score of the best match used for classification
            # the taxon identifiers of all database sequences with the best match
            # the accession numbers of all database sequences with the best match
            # matching fragment sequence(s)
            import pandas
            df = pandas.read_csv(ifile, sep="\t", header=None, names=columns)
            df = df[["class_flag", "seq_id", "tax_id", "tax_lineage"]]
            df.set_index(["seq_id"], drop=True, inplace=True, verify_integrity=True)
            return df

        def parse_kraken2_labels(ifile, columns=["class_flag", "seq_id", "taxon", "seq_length", "matches"]):
            # TODO: move to utils.py
            # https://github.com/DerrickWood/kraken2/wiki/Manual#standard-kraken-output-format
            # "C"/"U": a one-letter code indicating that the sequence was either classified or unclassified
            # the sequence ID, obtained from the FASTA/FASTQ header
            # the taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence is unclassified
            # the length of the sequence in bp
            # a space-delimited list indicating the LCA mapping of each k-mer in the sequence(s)
            import pandas
            df = pandas.read_csv(ifile, sep="\t", header=None, names=columns)
            df = df.assign(tax_name=df["taxon"].apply(lambda x: re.search("(?P<tname>^.*) \(taxid (?P<tid>[0-9]+)\)$", x).group("tname")))
            df = df.assign(tax_id=df["taxon"].apply(lambda x: re.search("(?P<tname>^.*) \(taxid (?P<tid>[0-9]+)\)$", x).group("tid")))
            df.drop(columns=["matches", "taxon"], inplace=True)
            df.set_index(["seq_id"], drop=True, inplace=True, verify_integrity=True)
            return df

        def parse_genecov_table(ifile):
            # TODO: move to utils.py
            import pandas
            df = pandas.read_csv(ifile, sep="\t", header=0)
            df.set_index(["prot_id"], drop=True, inplace=True, verify_integrity=True)
            return df

        def parse_hmmer_tblout(ifile, columns=["tname", "tacc", "qname", "qacc", "fs_evalue", "fs_score", "fs_bias", "b1d_evalue", "b1d_score", "b1d_bias", "exp", "reg", "clu", "ov", "env", "dom", "rep", "inc"]):
            # TODO: move to utils.py
            import pandas
            df = pandas.read_csv(ifile, sep="\t", header=None, names=columns)
            df.drop(columns=["tacc", "qacc", "exp", "reg", "clu", "ov", "env", "dom", "rep", "inc"], inplace=True)
            # keep only the best hit per target (lowest best-1-domain e-value)
            df = df.sort_values(by=["tname", "b1d_evalue"], axis=0, ascending=[True, True]).groupby("tname").head(1).reset_index(drop=True)
            df.set_index(["tname"], drop=True, inplace=True, verify_integrity=True)
            return df
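        # All parsers above return DataFrames indexed by protein or contig id; the
        # summary table below is filled by per-row lookups into those indexes
        # (None where an id has no entry) rather than by DataFrame merges.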
        # contig IDs and their length
        contigs = parse_assembly_fna(input.contigs)
        # print(contigs.head())
        # genes/proteins
        genes = parse_prodigal_faa_headers(input.proteins)
        genes["gene_length"] = abs(genes["end"] - genes["start"]) + 1
        genes = genes.assign(partial=genes["info"].apply(lambda x: not re.search(";partial=00;", x)))
        # print(genes.head())
        # gene coverage per mapping type
        genecov = {}
        for ifile in input.genecov:
            mtype = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(ifile))))
            genecov[mtype] = parse_genecov_table(ifile)[["ave_cov"]]
            # print(genecov[mtype].head())
        # kraken2
        kraken2 = {}
        for ifile in input.kraken2:
            db = os.path.basename(ifile).split(".")[2]
            kraken2[db] = parse_kraken2_labels(ifile)
            # print(kraken2[db].head())
        # kaiju
        kaiju = {}
        for ifile in input.kaiju:
            db = os.path.basename(ifile).split(".")[2]
            kaiju[db] = parse_kaiju_labels(ifile)
            # print(kaiju[db].head())
        # hits to reference genomes
        refhits = parse_diamond_tsv(input.refhits)
        refhits.set_index(["qseqid"], drop=True, inplace=True, verify_integrity=True)
        # print(refhits.head())
        # unique proteins from CD-HIT
        unique = {}
        for ifile in input.cdhit:
            tool2 = os.path.basename(ifile).split("__")[0].split("_")[1]
            unique[tool2] = list(parse_prodigal_faa_headers(ifile)["prot_id"].apply(lambda x: "__".join(x.split("__")[2:])))
            # print("cdhit: {}: {}: {}".format(tool2, len(unique[tool2]), unique[tool2][0:5]))
        # matched proteins from DIAMOND hits
        matched = {}
        for ifile in input.diamond:
            tool2 = os.path.basename(ifile).split("__")[0].split("_")[1]
            matched[tool2] = parse_diamond_tsv(ifile)
            matched[tool2].set_index(["qseqid"], drop=True, inplace=True, verify_integrity=True)
            # print(matched[tool2].head())
        # kegg
        kegg = parse_hmmer_tblout(input.kegg)
        kegg_info = pandas.read_csv(input.kegg_info, sep="\t", header=0, index_col=0)
        # print(kegg.head())
        # summary
        summary = genes.copy()
        summary.set_index(["prot_id"], drop=False, inplace=True, verify_integrity=True)
        summary["contig_length"] = summary["contig_id"].apply(lambda x: contigs.loc[x, "length"])
        for k, v in genecov.items():
            summary["ave_cov_%s" % k] = summary["prot_id"].apply(lambda x: v.loc[x, "ave_cov"])
        for k, v in kraken2.items():
            summary["kraken2_%s_name" % k] = summary["contig_id"].apply(lambda x: v.loc[x, "tax_name"])
            summary["kraken2_%s_id" % k] = summary["contig_id"].apply(lambda x: v.loc[x, "tax_id"])
        for k, v in kaiju.items():
            summary["kaiju_%s_lineage" % k] = summary["prot_id"].apply(lambda x: v.loc[x, "tax_lineage"])
            summary["kaiju_%s_id" % k] = summary["prot_id"].apply(lambda x: v.loc[x, "tax_id"])
        summary["ref_sseqid"] = summary["prot_id"].apply(lambda x: refhits.loc[x, "sseqid"] if x in refhits.index else None)
        summary["ref_pident"] = summary["prot_id"].apply(lambda x: refhits.loc[x, "pident"] if x in refhits.index else None)
        summary["ref_qslratio"] = summary["prot_id"].apply(lambda x: refhits.loc[x, "qs_len_ratio"] if x in refhits.index else None)
        summary["ref_qcov"] = summary["prot_id"].apply(lambda x: refhits.loc[x, "qcov"] if x in refhits.index else None)
        summary["ref_scov"] = summary["prot_id"].apply(lambda x: refhits.loc[x, "scov"] if x in refhits.index else None)
        for k, v in unique.items():
            summary["cdhit_unique_%s" % k] = False
            summary.loc[v, "cdhit_unique_%s" % k] = True
        for k, v in matched.items():
            summary["diamond_unique_%s" % k] = True
            summary.loc[v.index, "diamond_unique_%s" % k] = False
        summary["kegg"] = summary["prot_id"].apply(lambda x: kegg.loc[x, "qname"] if x in kegg.index else None)
        summary["kegg_evalue"] = summary["prot_id"].apply(lambda x: kegg.loc[x, "b1d_evalue"] if x in kegg.index else None)
        summary["kegg_name"] = summary["kegg"].apply(lambda x: kegg_info.loc[x.split("_")[0], "kegg_name"] if x is not None else None)
        summary["kegg_def"] = summary["kegg"].apply(lambda x: kegg_info.loc[x.split("_")[0], "kegg_def"] if x is not None else None)
        summary["kegg_pathways"] = summary["kegg"].apply(lambda x: kegg_info.loc[x.split("_")[0], "kegg_pathways"] if x is not None else None)
        # print(summary.head())
        # save
        summary.to_csv(output[0], sep="\t", header=True, index=False, na_rep="NA")
rule assembly_gene_summary_plots:
    input:
        genes=os.path.join(RESULTS_DIR, "extra/{rtype}_{tool}.genes.tsv")
    output:
        pdf=os.path.join(RESULTS_DIR, "extra/{rtype}_{tool}.genes.pdf")
    log:
        out=os.path.join(RESULTS_DIR, "extra/{rtype}_{tool}.genes.pdf.log")
    params:
        const=os.path.join(SRC_DIR, "const.R"),
        utils=os.path.join(SRC_DIR, "utils.R")
    conda:
        os.path.join(ENV_DIR, "r.yaml")
    script:
        os.path.join(SRC_DIR, "genes.R")
# #####################
# # Genome comparison #
# #####################
# rule sibeliaz:
# input:
# asm1=os.path.join(DATA_DIR, "assembly/{rtype1}/{tool1}/ASSEMBLY.POLISHED.fasta"),
# asm2=os.path.join(DATA_DIR, "assembly/{rtype2}/{tool2}/ASSEMBLY.POLISHED.fasta")
# output:
# os.path.join(RESULTS_DIR, "analysis/sibeliaz/{rtype1}_{tool1}__{rtype2}_{tool2}_output/blocks_coords.gff")
# log:
# out="logs/sibeliaz.{rtype1}.{tool1}.{rtype2}.{tool2}.out.log",
# err="logs/sibeliaz.{rtype1}.{tool1}.{rtype2}.{tool2}.err.log"
# threads:
# config["sibeliaz"]["threads"]
# params:
# kmers=config["sibeliaz"]["kmer"],
# memory=config["sibeliaz"]["mem"]
# wildcard_constraints:
# rtype1="|".join(READ_TYPES),
# rtype2="|".join(READ_TYPES),
# tool1="|".join(ASSEMBLERS),
# tool2="|".join(ASSEMBLERS)
# conda:
# os.path.join(ENV_DIR, "sibeliaz.yaml")
# message:
# "Genome comparision: Sibeliaz: {input.asm1} {input.asm2}"
# shell:
# "(date && "
# "sibeliaz -n -t {threads} -k {params.kmers} -f {params.memory} -o $(dirname {output}) {input.asm1} {input.asm2} && "
# "date) 2> {log.err} > {log.out}"
# rule concatenate:
# input:
# expand(os.path.join(REF_GENOMES_DIR, "{org}.fna"), org=config["ref_genomes"].keys())
# output:
# os.path.join(RESULTS_DIR,"reference/mock_genomes.fasta")
# log:
# out="logs/concat.out.log",
# err="logs/concat.err.log"
# shell:
# "(date && "
# "cat {input} > {output} && "
# "date) 2> {log.err} > {log.out}"
# rule zymo_sibeliaz:
# input:
# zymo=rules.concatenate.output,
# asm2=os.path.join(DATA_DIR, "assembly/{rtype}/{tool}/ASSEMBLY.POLISHED.fasta")
# output:
# os.path.join(RESULTS_DIR, "analysis/zymo_sibeliaz/zymo_{rtype}_{tool}_output/blocks_coords.gff")
# log:
# out="logs/sibeliaz.{rtype}.{tool}.out.log",
# err="logs/sibeliaz.{rtype}.{tool}.err.log"
# threads:
# config["sibeliaz"]["threads"]
# params:
# kmers=config["sibeliaz"]["kmer"],
# memory=config["sibeliaz"]["mem"]
# wildcard_constraints:
# rtype="|".join(READ_TYPES),
# tool="|".join(ASSEMBLERS)
# conda:
# os.path.join(ENV_DIR, "sibeliaz.yaml")
# message:
# "Genome comparision: Sibeliaz: {input.zymo} {input.asm2}"
# shell:
# "(date && "
# "sibeliaz -n -t {threads} -k {params.kmers} -f {params.memory} -o $(dirname {output}) {input.zymo} {input.asm2} && "
# "date) 2> {log.err} > {log.out}"