Skip to content
Snippets Groups Projects
Commit 2578f3d8 authored by Susheel Busi's avatar Susheel Busi
Browse files

Rules for tables to compare different assemblers and their mapping to zymo

parent 3294ef44
No related branches found
No related tags found
2 merge requests!78Add zymo workflow,!77Workflow: Zymo - Comparison to Reference Genomes
......@@ -2,6 +2,7 @@
work_dir: "/scratch/users/sbusi/zymo"
results_dir: "results"
data_dir: "/scratch/users/sbusi/zymo/Zymo/results"
ref_genomes_dir: "/scratch/users/sbusi/zymo/results/data_ref_genomes"
# Assembly
# List of assemblers for different read types: assembler names MUST be UNIQUE
......@@ -14,7 +15,6 @@ assemblers:
# Additional params
single_fast5_dir: "data_single_fast5"
multi_fast5_dir: "data_multi_fast5"
ref_genomes_dir: "data_ref_genomes"
# Annotation tools
prodigal:
......
......@@ -13,7 +13,7 @@ download_genomes:
runtime: "0-00:15:00"
concatenate:
partition: "interactive"
partition: "batch"
quality: "normal"
runtime: "0-00:05:00"
......
......@@ -3,6 +3,7 @@
##################################################
import os
import re
import pandas as pd
##############################
# CONFIG
......@@ -70,7 +71,9 @@ rule all:
rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS]),
expand(os.path.join(RESULTS_DIR, "analysis/comparison/common_{combi}.txt"),
combi=["%s_%s__%s_%s" % (p[0][0], p[0][1], p[1][0], p[1][1]) for p in READ_ASSEMBLER_PAIRS] +
["%s_%s__%s_%s" % (p[1][0], p[1][1], p[0][0], p[0][1]) for p in READ_ASSEMBLER_PAIRS])
["%s_%s__%s_%s" % (p[1][0], p[1][1], p[0][0], p[0][1]) for p in READ_ASSEMBLER_PAIRS]),
expand(os.path.join(RESULTS_DIR, "analysis/comparison/{rtype_tool}_comparison.txt"),
rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS])
rule download_genomes:
......@@ -203,25 +206,44 @@ rule zymo_headers:
rule comparison:
input:
in1=os.path.join(RESULTS_DIR,"analysis/headers/{rtype1}_{tool1}__{rtype2}_{tool2}_headers.txt"),
in2=os.path.join(RESULTS_DIR,"analysis/headers/{rtype2}_{tool2}__{rtype1}_{tool1}_headers.txt")
faa=os.path.join(DATA_DIR, "annotation/prodigal/{rtype2}/{tool2}/proteins.faa"),
tsv=os.path.join(RESULTS_DIR, "analysis/diamond/{rtype2}_{tool2}.tsv"),
cdhit=lambda wildcards: expand(
os.path.join(DATA_DIR, "analysis/cdhit/{rtype1_tool1}__{{rtype2}}_{{tool2}}.faa"),
rtype1_tool1=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS if rtype != wildcards.rtype2 and tool != wildcards.tool2])
output:
out1=os.path.join(RESULTS_DIR, "analysis/comparison/common_{rtype1}_{tool1}__{rtype2}_{tool2}.txt"),
out2=os.path.join(RESULTS_DIR, "analysis/comparison/{rtype1}_{tool1}_uniq__{rtype2}_{tool2}.txt"),
out3=os.path.join(RESULTS_DIR, "analysis/comparison/{rtype2}_{tool2}_uniq__{rtype1}_{tool1}.txt")
log:
out="logs/comparison.{rtype1}.{tool1}.{rtype2}.{tool2}.out.log",
err="logs/comparison.{rtype1}.{tool1}.{rtype2}.{tool2}.err.log"
os.path.join(RESULTS_DIR, "analysis/comparison/{rtype2}_{tool2}_comparison.txt")
wildcard_constraints:
rtype1="|".join(READ_TYPES),
rtype2="|".join(READ_TYPES),
tool1="|".join(ASSEMBLERS),
tool2="|".join(ASSEMBLERS)
shell:
"""
(date &&\
comm -12 <(sort {input.in1} | uniq) <(sort {input.in2} | uniq) > {output.out1} &&\
comm -23 <(sort {input.in1} | uniq) <(sort {input.in2} | uniq) > {output.out2} &&\
comm -13 <(sort {input.in1} | uniq) <(sort {input.in2} | uniq) > {output.out3} &&\
date) &> >(tee {log})
"""
\ No newline at end of file
tool2="|".join(ASSEMBLERS)
run:
# getting protein ids
pids = []
with open(input.faa,'r') as fi:
for ln in fi:
if ln.startswith('>'):
pids.append(re.sub("^>", "", ln.strip().split(" ")[0]))
# creating empty dataframe
df=pd.DataFrame(False, index=pids, columns=["ref"] + [os.path.basename(fname).split("__")[0] for fname in input.cdhit])
# parsing diamond hits
ref_hits=pd.read_csv(input.tsv, sep='\t', header=None)
ref_hits.columns=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qlen', 'slen']
df.loc[ref_hits["qseqid"],"ref"] = True
# parse cdhit faa files
for fname in input.cdhit:
fname_col=os.path.basename(fname).split("__")[0] # column name to be used
fname_pids=[]
with open(input.cdhit,'r') as fi:
for ln in fi:
if ln.startswith('>'):
fname_pids.append(re.sub("^>", "", ln.strip().split(" ")[0]))
df.loc[fname_pids,fname_col] = True
# save
df.to_csv(output[0], sep='\t', header=0)
\ No newline at end of file
0% — Loading, or an error occurred.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment