ESB / ONT_pilot_gitlab · Merge request !78
Add zymo workflow
Merged · Valentina Galata requested to merge zymo into bright_future 4 years ago
Overview 0 · Commits 39 · Pipelines 0 · Changes 1
Viewing commit 1b38a514 · 1 file · +42 −104
1b38a514 · zymo: updated snakemake (gene summary, issue #111) · Valentina Galata authored 4 years ago
workflow_zymo/Snakefile · +42 −104
@@ -30,16 +30,6 @@ rule all:
os.path.join(RESULTS_DIR, "extra/{rtype_tool}.genes.tsv"),
rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]
)
# expand(os.path.join(RESULTS_DIR, "analysis/headers/{combi}_headers.txt"),
# combi=["%s_%s__%s_%s" % (p[0][0], p[0][1], p[1][0], p[1][1]) for p in READ_ASSEMBLER_PAIRS] +
# ["%s_%s__%s_%s" % (p[1][0], p[1][1], p[0][0], p[0][1]) for p in READ_ASSEMBLER_PAIRS]),
# expand(os.path.join(RESULTS_DIR, "analysis/headers/{rtype_tool}_zymo_headers.txt"),
# rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]),
# # expand(os.path.join(RESULTS_DIR, "analysis/comparison/common_{combi}.txt"),
# # combi=["%s_%s__%s_%s" % (p[0][0], p[0][1], p[1][0], p[1][1]) for p in READ_ASSEMBLER_PAIRS] +
# # ["%s_%s__%s_%s" % (p[1][0], p[1][1], p[0][0], p[0][1]) for p in READ_ASSEMBLER_PAIRS]),
# expand(os.path.join(RESULTS_DIR, "analysis/comparison/{rtype_tool}_comparison.txt"),
# rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]),
# expand(os.path.join(RESULTS_DIR, "analysis/sibeliaz/{combi}_output/blocks_coords.gff"),
# combi=["%s_%s__%s_%s" % (p[0][0], p[0][1], p[1][0], p[1][1]) for p in READ_ASSEMBLER_PAIRS]),
# expand(os.path.join(RESULTS_DIR, "analysis/zymo_sibeliaz/zymo_{rtype_tool}_output/blocks_coords.gff"),
@@ -219,132 +209,80 @@ rule assembly_gene_summary:
df.set_index(["seq_id"], drop=True, inplace=True, verify_integrity=True)
return df
def parse_genecov_table(ifile):
# TODO: move to utils.py
import pandas
df = pandas.read_csv(ifile, sep="\t", header=0)
df.set_index(["prot_id"], drop=True, inplace=True, verify_integrity=True)
return df
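# Note: parse_genecov_table mirrors the preceding parser but indexes on
# "prot_id"; the coverage loop below switches from an inline pandas.read_csv
# to this helper.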
# contig IDs and their length
contigs = parse_assembly_fna(input.contigs)
print(contigs.head())
# print(contigs.head())
# genes/proteins
genes = parse_prodigal_faa_headers(input.proteins)
genes = genes.assign(partial=genes["info"].apply(lambda x: not re.search(";partial=00;", x)))
print(genes.head())
# print(genes.head())
# gene cov
genecov = {}
for ifile in input.genecov:
mtype = os.path.basename(os.path.dirname(os.path.dirname(ifile)))
genecov[mtype] = pandas.read_csv(ifile, sep="\t", header=0)[["prot_id", "ave_cov"]]
print(genecov[mtype].head())
mtype = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(ifile))))
genecov[mtype] = parse_genecov_table(ifile)[["ave_cov"]]
# print(genecov[mtype].head())
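# Note: compared with the removed lines above, the updated mtype is taken one
# directory level higher (three os.path.dirname calls instead of two), and
# parse_genecov_table already indexes on prot_id, so only "ave_cov" is selected.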
# kraken2
kraken2 = {}
for ifile in input.kraken2:
db = os.path.basename(ifile).split(".")[2]
kraken2[db] = parse_kraken2_labels(ifile)
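# Note: the database name is taken as the third dot-separated field of the
# file name (the exact naming pattern of input.kraken2 is an assumption here).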
print(kraken2[db].head())
# print(kraken2[db].head())
# kaiju
kaiju = {}
for ifile in input.kaiju:
db = os.path.basename(ifile).split(".")[2]
kaiju[db] = parse_kaiju_labels(ifile)
print(kaiju[db].head())
# print(kaiju[db].head())
# hits to ref. genomes
refhits = parse_diamond_tsv(input.refhits)
print(refhits.head())
refhits.set_index(["qseqid"], drop=True, inplace=True, verify_integrity=True)
# print(refhits.head())
# unique prot.s from cdhit
unique = {}
for ifile in input.cdhit:
tool2 = os.path.basename(ifile).split("__")[0].split("_")[1]
unique[tool2] = list(parse_prodigal_faa_headers(ifile)["prot_id"].apply(lambda x: "__".join(x.split("__")[2:])))
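# Note: "__".join(x.split("__")[2:]) drops the first two "__"-delimited fields,
# which appear to be an assembly prefix added upstream of the CD-HIT step
# (assumption), leaving the original protein ID.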
print("cdhit: {}: {}: {}".format(tool2, len(unique[tool2]), unique[tool2][0:5]))
#
print("cdhit: {}: {}: {}".format(tool2, len(unique[tool2]), unique[tool2][0:5]))
# matched prots. from diamond hits
matched = {}
for ifile in input.diamond:
tool2 = os.path.basename(ifile).split("__")[0].split("_")[1]
matched[tool2] = parse_diamond_tsv(ifile)
print(matched[tool2].head())
matched[tool2].set_index(["qseqid"], drop=True, inplace=True, verify_integrity=True)
# print(matched[tool2].head())
# summary
# TODO
summary = genes.copy()
summary.set_index(["prot_id"], drop=False, inplace=True, verify_integrity=True)
summary["contig_length"] = summary["contig_id"].apply(lambda x: contigs.loc[x,"length"])
for k, v in genecov.items():
summary["ave_cov_%s" % k] = summary["prot_id"].apply(lambda x: v.loc[x,"ave_cov"])
for k, v in kraken2.items():
summary["kraken2_%s_name" % k] = summary["contig_id"].apply(lambda x: v.loc[x,"tax_name"])
summary["kraken2_%s_id" % k] = summary["contig_id"].apply(lambda x: v.loc[x,"tax_id"])
for k, v in kaiju.items():
summary["kaiju_%s_lineage" % k] = summary["prot_id"].apply(lambda x: v.loc[x,"tax_lineage"])
summary["kaiju_%s_id" % k] = summary["prot_id"].apply(lambda x: v.loc[x,"tax_id"])
summary["ref_sseqid"] = summary["prot_id"].apply(lambda x: refhits.loc[x,"sseqid"] if x in refhits.index else None)
summary["ref_qslratio"] = summary["prot_id"].apply(lambda x: refhits.loc[x,"qs_len_ratio"] if x in refhits.index else None)
summary["ref_qcov"] = summary["prot_id"].apply(lambda x: refhits.loc[x,"qcov"] if x in refhits.index else None)
summary["ref_scov"] = summary["prot_id"].apply(lambda x: refhits.loc[x,"scov"] if x in refhits.index else None)
for k, v in unique.items():
summary["cdhit_unique_%s" % k] = False
summary.loc[v,"cdhit_unique_%s" % k] = True
for k, v in matched.items():
summary["diamond_unique_%s" % k] = True
summary.loc[v.index,"diamond_unique_%s" % k] = False
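# Note: cdhit_unique_<tool> defaults to False and is set True for proteins listed
# in the CD-HIT output, while diamond_unique_<tool> defaults to True and is set
# False for proteins with a DIAMOND hit; presumably both flag proteins unique to
# this assembly.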
# print(summary.head())
# save
# summary.to_csv(output[0], sep="\t", columns=True, index=True, index_label="protein_id")
###################################################################
# rule get_headers:
# input:
# os.path.join(DATA_DIR, "analysis/cdhit/{rtype1}_{tool1}__{rtype2}_{tool2}.faa")
# output:
# os.path.join(RESULTS_DIR,"analysis/headers/{rtype1}_{tool1}__{rtype2}_{tool2}_headers.txt")
# log:
# out="logs/headers.{rtype1}.{tool1}.{rtype2}.{tool2}.out.log",
# err="logs/headers.{rtype1}.{tool1}.{rtype2}.{tool2}.err.log"
# wildcard_constraints:
# rtype1="|".join(READ_TYPES),
# rtype2="|".join(READ_TYPES),
# tool1="|".join(ASSEMBLERS),
# tool2="|".join(ASSEMBLERS)
# shell:
# "(date && "
# "grep '>' {input} | sed 's/^.*\_\_//' | sed 's/\ .*//' > {output} && "
# "date) 2> {log.err} > {log.out}"
# rule zymo_headers:
# input:
# os.path.join(RESULTS_DIR, "analysis/diamond/{rtype}_{tool}.tsv")
# output:
# os.path.join(RESULTS_DIR,"analysis/headers/{rtype}_{tool}_zymo_headers.txt")
# log:
# out="logs/zymo_headers.{rtype}.{tool}.out.log",
# err="logs/zymo_headers.{rtype}.{tool}.err.log"
# wildcard_constraints:
# rtype="|".join(READ_TYPES),
# tool="|".join(ASSEMBLERS)
# shell:
# "(date && "
# "awk '{{print $1}}' {input} > {output} && "
# "date) 2> {log.err} > {log.out}"
# rule comparison:
# input:
# faa=os.path.join(DATA_DIR, "annotation/prodigal/{rtype2}/{tool2}/proteins.faa"),
# tsv=os.path.join(RESULTS_DIR, "analysis/diamond/{rtype2}_{tool2}.tsv"),
# cdhit=lambda wildcards: expand(
# os.path.join(DATA_DIR, "analysis/cdhit/{rtype1_tool1}__{{rtype2}}_{{tool2}}.faa"),
# rtype1_tool1=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS if rtype != wildcards.rtype2 and tool != wildcards.tool2])
# output:
# os.path.join(RESULTS_DIR, "analysis/comparison/{rtype2}_{tool2}_comparison.txt")
# wildcard_constraints:
# rtype1="|".join(READ_TYPES),
# rtype2="|".join(READ_TYPES),
# tool1="|".join(ASSEMBLERS),
# tool2="|".join(ASSEMBLERS)
# run:
# # getting protein ids
# pids = []
# with open(input.faa,'r') as fi:
# for ln in fi:
# if ln.startswith('>'):
# pids.append(re.sub("^>", "", ln.strip().split(" ")[0]))
# # creating empty dataframe
# df=pandas.DataFrame(False, index=pids, columns=["ref"] + [os.path.basename(fname).split("__")[0] for fname in input.cdhit])
# # parsing diamond hits
# ref_hits=pandas.read_csv(input.tsv, sep='\t', header=None)
# ref_hits.columns=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen']
# df.loc[ref_hits["qseqid"],"ref"] = True
# # parse cdhit faa files
# for fname in input.cdhit:
# fname_col=os.path.basename(fname).split("__")[0] # column name to be used
# fname_pids=[]
# with open(fname,'r') as fi:
# for ln in fi:
# if ln.startswith('>'):
# nam=re.sub("^>", "", ln.strip().split(" ")[0])
# fname_pids.append(nam.strip().split("__")[2])
# df.loc[fname_pids,fname_col] = True
# # save
# df.to_csv(output[0], sep='\t', header=0)
summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False, na_rep="NA")
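# Note: prot_id is kept as a regular column (set_index(..., drop=False) above),
# so index=False avoids writing it twice; na_rep="NA" fills the missing ref_*
# values for proteins without a reference hit.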
# #####################
# # Genome comparison #
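For context, here is a minimal, self-contained sketch of the updated coverage-parsing pattern from assembly_gene_summary. The file path, table contents, and protein IDs below are hypothetical stand-ins; the real inputs come from the Snakemake input.genecov list.

import os
from io import StringIO

import pandas

def parse_genecov_table(ifile):
    # Same logic as the helper added in this commit: read a per-gene coverage
    # TSV and index it by protein ID.
    df = pandas.read_csv(ifile, sep="\t", header=0)
    df.set_index(["prot_id"], drop=True, inplace=True, verify_integrity=True)
    return df

# Hypothetical path and table (stand-ins for one entry of input.genecov).
ifile = "results/coverage/lr/flye/mapping/genes.avg_cov.tsv"
table = StringIO("prot_id\tave_cov\ncontig_1__gene_1\t12.5\ncontig_1__gene_2\t3.0\n")

# Label the table by the directory three levels above the file, mirroring the
# new mtype extraction; for the path above this yields "lr".
mtype = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(ifile))))
genecov = {mtype: parse_genecov_table(table)[["ave_cov"]]}
print(mtype)
print(genecov[mtype].head())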