
Add zymo workflow

Merged Valentina Galata requested to merge zymo into bright_future
1 file  +42  −104
@@ -30,16 +30,6 @@ rule all:
os.path.join(RESULTS_DIR, "extra/{rtype_tool}.genes.tsv"),
rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
)
# expand(os.path.join(RESULTS_DIR, "analysis/headers/{combi}_headers.txt"),
# combi=["%s_%s__%s_%s" % (p[0][0], p[0][1], p[1][0], p[1][1]) for p in READ_ASSEMBLER_PAIRS] +
# ["%s_%s__%s_%s" % (p[1][0], p[1][1], p[0][0], p[0][1]) for p in READ_ASSEMBLER_PAIRS]),
# expand(os.path.join(RESULTS_DIR, "analysis/headers/{rtype_tool}_zymo_headers.txt"),
# rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]),
# # expand(os.path.join(RESULTS_DIR, "analysis/comparison/common_{combi}.txt"),
# # combi=["%s_%s__%s_%s" % (p[0][0], p[0][1], p[1][0], p[1][1]) for p in READ_ASSEMBLER_PAIRS] +
# # ["%s_%s__%s_%s" % (p[1][0], p[1][1], p[0][0], p[0][1]) for p in READ_ASSEMBLER_PAIRS]),
# expand(os.path.join(RESULTS_DIR, "analysis/comparison/{rtype_tool}_comparison.txt"),
# rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]),
# expand(os.path.join(RESULTS_DIR, "analysis/sibeliaz/{combi}_output/blocks_coords.gff"),
# combi=["%s_%s__%s_%s" % (p[0][0], p[0][1], p[1][0], p[1][1]) for p in READ_ASSEMBLER_PAIRS]),
# expand(os.path.join(RESULTS_DIR, "analysis/zymo_sibeliaz/zymo_{rtype_tool}_output/blocks_coords.gff"),
@@ -219,132 +209,80 @@ rule assembly_gene_summary:
df.set_index(["seq_id"], drop=True, inplace=True, verify_integrity=True)
return df
+def parse_genecov_table(ifile):
+# TODO: move to utils.py
+import pandas
+df = pandas.read_csv(ifile, sep="\t", header=0)
+df.set_index(["prot_id"], drop=True, inplace=True, verify_integrity=True)
+return df
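The other parsers used below (parse_assembly_fna, parse_prodigal_faa_headers, parse_kraken2_labels, parse_kaiju_labels, parse_diamond_tsv) are defined elsewhere in the Snakefile. As rough orientation, parse_prodigal_faa_headers presumably does something like the following sketch, assuming standard Prodigal FAA headers; this is a hypothetical reconstruction, not code from this repository:

    import pandas

    def parse_prodigal_faa_headers(ifile):
        # Hypothetical reconstruction. Prodigal FAA headers look like:
        # >contig_1_5 # 3465 # 4010 # 1 # ID=1_5;partial=00;start_type=ATG;...
        rows = []
        with open(ifile) as fi:
            for ln in fi:
                if ln.startswith(">"):
                    prot_id, info = ln[1:].strip().split(" # ", 1)
                    contig_id = prot_id.rsplit("_", 1)[0]  # Prodigal appends _<n> per gene
                    rows.append({"prot_id": prot_id, "contig_id": contig_id, "info": info})
        return pandas.DataFrame(rows)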
# contig IDs and their length
contigs = parse_assembly_fna(input.contigs)
-print(contigs.head())
+# print(contigs.head())
# genes/proteins
genes = parse_prodigal_faa_headers(input.proteins)
genes = genes.assign(partial=genes["info"].apply(lambda x: not re.search(";partial=00;", x)))
-print(genes.head())
+# print(genes.head())
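Prodigal writes `partial=00` for genes that are complete at both ends, so the lambda above flags everything else as partial. For example:

    import re
    # partial=00 = complete; partial=10/01/11 = gene runs off a contig edge
    for info in ["ID=1_1;partial=00;start_type=ATG", "ID=1_2;partial=10;start_type=Edge"]:
        print(not re.search(";partial=00;", info))  # False (complete), then True (partial)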
# gene cov
genecov = {}
for ifile in input.genecov:
-mtype = os.path.basename(os.path.dirname(os.path.dirname(ifile)))
-genecov[mtype] = pandas.read_csv(ifile, sep="\t", header=0)[["prot_id", "ave_cov"]]
-print(genecov[mtype].head())
+mtype = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(ifile))))
+genecov[mtype] = parse_genecov_table(ifile)[["ave_cov"]]
+# print(genecov[mtype].head())
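The extra os.path.dirname() call reflects one additional directory level between the coverage table and the read-type label. With a made-up path (the real RESULTS_DIR layout is outside this diff):

    import os
    ifile = "results/coverage/metag/sr_flye/genecov/genes.avg_cov.tsv"
    # three dirname() calls climb from the file up to the read-type directory
    print(os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(ifile)))))  # 'metag'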
# kraken2
kraken2 = {}
for ifile in input.kraken2:
db = os.path.basename(ifile).split(".")[2]
kraken2[db] = parse_kraken2_labels(ifile)
-print(kraken2[db].head())
+# print(kraken2[db].head())
# kaiju
kaiju = {}
for ifile in input.kaiju:
db = os.path.basename(ifile).split(".")[2]
kaiju[db] = parse_kaiju_labels(ifile)
-print(kaiju[db].head())
+# print(kaiju[db].head())
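For both kraken2 and kaiju the database label is taken as the third dot-separated field of the file name. With a hypothetical file name following that scheme:

    import os
    ifile = "results/analysis/kraken2/sr_flye.kraken2.maxikraken.labels.txt"
    print(os.path.basename(ifile).split(".")[2])  # 'maxikraken'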
# hits to ref. genomes
refhits = parse_diamond_tsv(input.refhits)
-print(refhits.head())
+refhits.set_index(["qseqid"], drop=True, inplace=True, verify_integrity=True)
+# print(refhits.head())
# unique prot.s from cdhit
unique = {}
for ifile in input.cdhit:
tool2 = os.path.basename(ifile).split("__")[0].split("_")[1]
unique[tool2] = list(parse_prodigal_faa_headers(ifile)["prot_id"].apply(lambda x: "__".join(x.split("__")[2:])))
print("cdhit: {}: {}: {}".format(tool2, len(unique[tool2]), unique[tool2][0:5]))
# print("cdhit: {}: {}: {}".format(tool2, len(unique[tool2]), unique[tool2][0:5]))
# matched prots. from diamond hits
matched = {}
for ifile in input.diamond:
tool2 = os.path.basename(ifile).split("__")[0].split("_")[1]
matched[tool2] = parse_diamond_tsv(ifile)
-print(matched[tool2].head())
+matched[tool2].set_index(["qseqid"], drop=True, inplace=True, verify_integrity=True)
+# print(matched[tool2].head())
# summary
# TODO
summary = genes.copy()
summary.set_index(["prot_id"], drop=False, inplace=True, verify_integrity=True)
summary["contig_length"] = summary["contig_id"].apply(lambda x: contigs.loc[x,"length"])
for k, v in genecov.items():
summary["ave_cov_%s" % k] = summary["prot_id"].apply(lambda x: v.loc[x,"ave_cov"])
for k, v in kraken2.items():
summary["kraken2_%s_name" % k] = summary["contig_id"].apply(lambda x: v.loc[x,"tax_name"])
summary["kraken2_%s_id" % k] = summary["contig_id"].apply(lambda x: v.loc[x,"tax_id"])
for k, v in kaiju.items():
summary["kaiju_%s_lineage" % k] = summary["prot_id"].apply(lambda x: v.loc[x,"tax_lineage"])
summary["kaiju_%s_id" % k] = summary["prot_id"].apply(lambda x: v.loc[x,"tax_id"])
summary["ref_sseqid"] = summary["prot_id"].apply(lambda x: refhits.loc[x,"sseqid"] if x in refhits.index else None)
summary["ref_qslratio"] = summary["prot_id"].apply(lambda x: refhits.loc[x,"qs_len_ratio"] if x in refhits.index else None)
summary["ref_qcov"] = summary["prot_id"].apply(lambda x: refhits.loc[x,"qcov"] if x in refhits.index else None)
summary["ref_scov"] = summary["prot_id"].apply(lambda x: refhits.loc[x,"scov"] if x in refhits.index else None)
for k, v in unique.items():
summary["cdhit_unique_%s" % k] = False
summary.loc[v,"cdhit_unique_%s" % k] = True
for k, v in matched.items():
summary["diamond_unique_%s" % k] = True
summary.loc[v.index,"diamond_unique_%s" % k] = False
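Each summary column above is filled by calling .loc row by row inside apply(), which does one Python-level lookup per protein. A sketch of an equivalent, vectorised construction with Series.map, using the same frames; shown as an alternative, not the code that was merged:

    # Series.map does the same index-based lookup in a single pass:
    summary["contig_length"] = summary["contig_id"].map(contigs["length"])
    for k, v in genecov.items():
        summary["ave_cov_%s" % k] = summary["prot_id"].map(v["ave_cov"])
    # the kraken2/kaiju/ref columns can be rewritten the same way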
# print(summary.head())
# save
-# summary.to_csv(output[0], sep="\t", columns=True, index=True, index_label="protein_id")
-###################################################################
-# rule get_headers:
-# input:
-# os.path.join(DATA_DIR, "analysis/cdhit/{rtype1}_{tool1}__{rtype2}_{tool2}.faa")
-# output:
-# os.path.join(RESULTS_DIR,"analysis/headers/{rtype1}_{tool1}__{rtype2}_{tool2}_headers.txt")
-# log:
-# out="logs/headers.{rtype1}.{tool1}.{rtype2}.{tool2}.out.log",
-# err="logs/headers.{rtype1}.{tool1}.{rtype2}.{tool2}.err.log"
-# wildcard_constraints:
-# rtype1="|".join(READ_TYPES),
-# rtype2="|".join(READ_TYPES),
-# tool1="|".join(ASSEMBLERS),
-# tool2="|".join(ASSEMBLERS)
-# shell:
-# "(date && "
-# "grep '>' {input} | sed 's/^.*\_\_//' | sed 's/\ .*//' > {output} && "
-# "date) 2> {log.err} > {log.out}"
-# rule zymo_headers:
-# input:
-# os.path.join(RESULTS_DIR, "analysis/diamond/{rtype}_{tool}.tsv")
-# output:
-# os.path.join(RESULTS_DIR,"analysis/headers/{rtype}_{tool}_zymo_headers.txt")
-# log:
-# out="logs/zymo_headers.{rtype}.{tool}.out.log",
-# err="logs/zymo_headers.{rtype}.{tool}.err.log"
-# wildcard_constraints:
-# rtype="|".join(READ_TYPES),
-# tool="|".join(ASSEMBLERS)
-# shell:
-# "(date && "
-# "awk '{{print $1}}' {input} > {output} && "
-# "date) 2> {log.err} > {log.out}"
-# rule comparison:
-# input:
-# faa=os.path.join(DATA_DIR, "annotation/prodigal/{rtype2}/{tool2}/proteins.faa"),
-# tsv=os.path.join(RESULTS_DIR, "analysis/diamond/{rtype2}_{tool2}.tsv"),
-# cdhit=lambda wildcards: expand(
-# os.path.join(DATA_DIR, "analysis/cdhit/{rtype1_tool1}__{{rtype2}}_{{tool2}}.faa"),
-# rtype1_tool1=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS if rtype != wildcards.rtype2 and tool != wildcards.tool2])
-# output:
-# os.path.join(RESULTS_DIR, "analysis/comparison/{rtype2}_{tool2}_comparison.txt")
-# wildcard_constraints:
-# rtype1="|".join(READ_TYPES),
-# rtype2="|".join(READ_TYPES),
-# tool1="|".join(ASSEMBLERS),
-# tool2="|".join(ASSEMBLERS)
-# run:
-# # getting protein ids
-# pids = []
-# with open(input.faa,'r') as fi:
-# for ln in fi:
-# if ln.startswith('>'):
-# pids.append(re.sub("^>", "", ln.strip().split(" ")[0]))
-# # creating empty dataframe
-# df=pandas.DataFrame(False, index=pids, columns=["ref"] + [os.path.basename(fname).split("__")[0] for fname in input.cdhit])
-# # parsing diamond hits
-# ref_hits=pandas.read_csv(input.tsv, sep='\t', header=None)
-# ref_hits.columns=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen']
-# df.loc[ref_hits["qseqid"],"ref"] = True
-# # parse cdhit faa files
-# for fname in input.cdhit:
-# fname_col=os.path.basename(fname).split("__")[0] # column name to be used
-# fname_pids=[]
-# with open(fname,'r') as fi:
-# for ln in fi:
-# if ln.startswith('>'):
-# nam=re.sub("^>", "", ln.strip().split(" ")[0])
-# fname_pids.append(nam.strip().split("__")[2])
-# df.loc[fname_pids,fname_col] = True
-# # save
-# df.to_csv(output[0], sep='\t', header=0)
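The removed comparison rule builds a presence/absence matrix: one row per protein of the focal assembly, a "ref" column that is True when the protein hit the Zymo reference proteins, and one True/False column per CD-HIT comparison. Note that pandas treats the falsy header=0 in to_csv() as header=False, so the matrix would have been written without a header row; header=True was probably intended. A toy version of such a matrix:

    import pandas
    df = pandas.DataFrame(False, index=["p1", "p2", "p3"], columns=["ref", "sr_megahit"])
    df.loc[["p1", "p3"], "ref"] = True  # proteins with a reference hit
    print(df)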
+summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False, na_rep="NA")
# #####################
# # Genome comparison #