Skip to content
Snippets Groups Projects
Commit 2578f3d8 authored by Susheel Busi's avatar Susheel Busi
Browse files

Rules for tables to compare different assemblers and their mapping to zymo

parent 3294ef44
No related branches found
No related tags found
2 merge requests!78Add zymo workflow,!77Workflow: Zymo - Comparison to Reference Genomes
......@@ -2,6 +2,7 @@
work_dir: "/scratch/users/sbusi/zymo"
results_dir: "results"
data_dir: "/scratch/users/sbusi/zymo/Zymo/results"
ref_genomes_dir: "/scratch/users/sbusi/zymo/results/data_ref_genomes"
# Assembly
# List of assemblers for different read types: assembler names MUST be UNIQUE
......@@ -14,7 +15,6 @@ assemblers:
# Additional params
single_fast5_dir: "data_single_fast5"
multi_fast5_dir: "data_multi_fast5"
ref_genomes_dir: "data_ref_genomes"
# Annotation tools
prodigal:
......
......@@ -13,7 +13,7 @@ download_genomes:
runtime: "0-00:15:00"
concatenate:
partition: "interactive"
partition: "batch"
quality: "normal"
runtime: "0-00:05:00"
......
......@@ -3,6 +3,7 @@
##################################################
import os
import re
import pandas as pd
##############################
# CONFIG
......@@ -70,7 +71,9 @@ rule all:
rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]),
expand(os.path.join(RESULTS_DIR, "analysis/comparison/common_{combi}.txt"),
combi=["%s_%s__%s_%s" % (p[0][0], p[0][1], p[1][0], p[1][1]) for p in READ_ASSEMBLER_PAIRS] +
["%s_%s__%s_%s" % (p[1][0], p[1][1], p[0][0], p[0][1]) for p in READ_ASSEMBLER_PAIRS])
["%s_%s__%s_%s" % (p[1][0], p[1][1], p[0][0], p[0][1]) for p in READ_ASSEMBLER_PAIRS]),
expand(os.path.join(RESULTS_DIR, "analysis/comparison/{rtype_tool}_comparison.txt"),
rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS])
rule download_genomes:
......@@ -203,25 +206,44 @@ rule zymo_headers:
rule comparison:
input:
in1=os.path.join(RESULTS_DIR,"analysis/headers/{rtype1}_{tool1}__{rtype2}_{tool2}_headers.txt"),
in2=os.path.join(RESULTS_DIR,"analysis/headers/{rtype2}_{tool2}__{rtype1}_{tool1}_headers.txt")
faa=os.path.join(DATA_DIR, "annotation/prodigal/{rtype2}/{tool2}/proteins.faa"),
tsv=os.path.join(RESULTS_DIR, "analysis/diamond/{rtype2}_{tool2}.tsv"),
cdhit=lambda wildcards: expand(
os.path.join(DATA_DIR, "analysis/cdhit/{rtype1_tool1}__{{rtype2}}_{{tool2}}.faa"),
rtype1_tool1=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS if rtype != wildcards.rtype2 and tool != wildcards.tool2])
output:
out1=os.path.join(RESULTS_DIR, "analysis/comparison/common_{rtype1}_{tool1}__{rtype2}_{tool2}.txt"),
out2=os.path.join(RESULTS_DIR, "analysis/comparison/{rtype1}_{tool1}_uniq__{rtype2}_{tool2}.txt"),
out3=os.path.join(RESULTS_DIR, "analysis/comparison/{rtype2}_{tool2}_uniq__{rtype1}_{tool1}.txt")
log:
out="logs/comparison.{rtype1}.{tool1}.{rtype2}.{tool2}.out.log",
err="logs/comparison.{rtype1}.{tool1}.{rtype2}.{tool2}.err.log"
os.path.join(RESULTS_DIR, "analysis/comparison/{rtype2}_{tool2}_comparison.txt")
wildcard_constraints:
rtype1="|".join(READ_TYPES),
rtype2="|".join(READ_TYPES),
tool1="|".join(ASSEMBLERS),
tool2="|".join(ASSEMBLERS)
shell:
"""
(date &&\
comm -12 <(sort {input.in1} | uniq) <(sort {input.in2} | uniq) > {output.out1} &&\
comm -23 <(sort {input.in1} | uniq) <(sort {input.in2} | uniq) > {output.out2} &&\
comm -13 <(sort {input.in1} | uniq) <(sort {input.in2} | uniq) > {output.out3} &&\
date) &> >(tee {log})
"""
\ No newline at end of file
tool2="|".join(ASSEMBLERS)
run:
# getting protein ids
pids = []
with open(input.faa,'r') as fi:
for ln in fi:
if ln.startswith('>'):
pids.append(re.sub("^>", "", ln.strip().split(" ")[0]))
# creating empty dataframe
df=pd.DataFrame(False, index=pids, columns=["ref"] + [os.path.basename(fname).split("__")[0] for fname in input.cdhit])
# parsing diamond hits
ref_hits=pd.read_csv(input.tsv, sep='\t', header=None)
ref_hits.columns=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen']
df.loc[ref_hits["qseqid"],"ref"] = True
# parse cdhit faa files
for fname in input.cdhit:
fname_col=os.path.basename(fname).split("__")[0] # column name to be used
fname_pids=[]
with open(input.cdhit,'r') as fi:
for ln in fi:
if ln.startswith('>'):
fname_pids.append(re.sub("^>", "", ln.strip().split(" ")[0]))
df.loc[fname_pids,fname_col] = True
# save
df.to_csv(output[0], sep='\t', header=0)
\ No newline at end of file
0% — Loading, or an error occurred.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment