From 1257fcdb99b0cbdd820cf5a0fbfa9bc41375d6b3 Mon Sep 17 00:00:00 2001
From: Valentina Galata <valentina.galata@uni.lu>
Date: Thu, 10 Sep 2020 09:00:28 +0200
Subject: [PATCH] issue #77: cov plot example

---
 notes/cdhit_uniq_cov.R  | 56 +++++++++++++++++++++++++++++++++
 notes/cdhit_uniq_cov.sh | 70 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 126 insertions(+)
 create mode 100644 notes/cdhit_uniq_cov.R
 create mode 100644 notes/cdhit_uniq_cov.sh

diff --git a/notes/cdhit_uniq_cov.R b/notes/cdhit_uniq_cov.R
new file mode 100644
index 0000000..bd15532
--- /dev/null
+++ b/notes/cdhit_uniq_cov.R
@@ -0,0 +1,56 @@
+#!/usr/bin/Rscript
+
+## NOTE
+# XXX
+
+## ARGS
+ARGS    <- commandArgs(trailingOnly=TRUE)
+COV_G   <- ARGS[1]
+COV_T   <- ARGS[2]
+UNIQ    <- ARGS[3]
+GENES   <- ARGS[4]
+TITLE   <- ARGS[5]
+PDF     <- ARGS[6]
+cols_cov <- c("contig", "start", "end", "coverage")
+cols_gen <- c("gene", "start", "end", "strand", "info")
+
+## IMPORT
+suppressMessages(library(ggplot2)) # plotting
+suppressMessages(library(reshape2)) # reshaping dataframes
+suppressMessages(library(scales)) # plot scales
+suppressMessages(library(viridis)) # color palette
+
+## DATA
+# Coverage
+df_cov_G <- read.csv(file=COV_G, sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_cov)
+df_cov_T <- read.csv(file=COV_T, sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_cov)
+# Genes
+df_uniq         <- read.csv(file=UNIQ,  sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_gen)
+df_genes        <- read.csv(file=GENES, sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_gen)
+df_genes$uniq   <- df_genes$gene %in% df_uniq$gene
+
+## PLOT
+cov_plot <- function(df_c, df_g){
+    max_y <- max(df_c$coverage)
+    pp <-
+        ggplot(data=df_c) +
+        geom_rect(data=df_g, aes(xmin=start, xmax=end, ymin=0, ymax=max_y, fill=uniq), alpha=0.2) +
+        geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=coverage), fill="black", colour=NA) +
+        geom_hline(yintercept=10, colour="red") +
+        scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
+        labs(
+            title=TITLE,
+            x="base",
+            y="coverage"
+        ) +
+        theme_bw()
+    return(pp)
+}
+
+pp_G <- cov_plot(df_cov_G, df_genes)
+pp_T <- cov_plot(df_cov_T, df_genes)
+
+pdf(PDF, width=60, height=4)
+print(pp_G)
+print(pp_T)
+dev.off()
diff --git a/notes/cdhit_uniq_cov.sh b/notes/cdhit_uniq_cov.sh
new file mode 100644
index 0000000..361a858
--- /dev/null
+++ b/notes/cdhit_uniq_cov.sh
@@ -0,0 +1,70 @@
+#!/bin/bash -l
+
+#SBATCH -J ONT_GDB
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH -c 15
+#SBATCH --time=0-01:00:00
+#SBATCH -p batch
+#SBATCH --qos=qos-batch
+
+# NOTE: script to compute and plot metaT and metaG coverage for a specific assembly
+#       and genes not found in another assembly
+# NOTE: this is for testing purposes only !!!
+
+# Looking for conda env.s
+#   grep -r --include="*.yaml" "samtools" ~/miniconda3/envs/ONT_pilot/pipeline/
+# Contig cov. and length:
+#   sort -n -k2,2 ../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.cov.ave.txt | tail -n 10
+#   sort -n -k2,2 ../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.cov.ave.txt | tail -n 10 | cut -f1 -d" " | while read f; do grep "${f}" ../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.idxstats.txt; done
+
+# TEST: lr_flye, contig_446, sr_metaspades
+# TEST: lr_flye_sr, contig_446, sr_metaspades
+
+cd /mnt/lscratch/users/vgalata/ont_pilot/cdhit_cov_example
+
+rtype1="sr"; tool1="metaspades"
+rtype2="lr"; tool2="flye_sr"
+
+contig="contig_446"
+
+threads=15
+sr1="../results/preproc/metag/sr/R1.fastp.fastq.gz"
+sr2="../results/preproc/metag/sr/R2.fastp.fastq.gz"
+asm="../results/assembly/${rtype2}/${tool2}/ASSEMBLY.FILTERED.fasta"
+idx="../results/assembly/${rtype2}/${tool2}/ASSEMBLY.FILTERED"
+faa="../results/annotation/prodigal/${rtype2}/${tool2}/proteins.faa"
+uniq="../results/analysis/cdhit/${rtype1}_${tool1}__${rtype2}_${tool2}.faa"
+
+bam_metaG="${rtype2}_${tool2}.metaG.bam"
+bam_metaT="../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.bam"
+cov_metaG="${rtype2}_${tool2}.metaG.cov"
+cov_metaT="${rtype2}_${tool2}.metaT.cov"
+ids_all="${rtype2}_${tool2}_${contig}.ids"
+ids_uniq="${rtype2}_${tool2}_${contig}.uniq.ids"
+pdf="${rtype2}_${tool2}_${contig}.pdf"
+
+# BAMs
+# metaG, SR: as in the rules
+# metaT, SR: already exists
+conda activate /home/users/vgalata/miniconda3/envs/ONT_pilot/pipeline/7070f59f
+bwa mem -t ${threads} ${idx} ${sr1} ${sr2} | samtools view -@ ${threads} -SbT ${asm} | samtools sort -@ ${threads} -m 4G -T ${bam_metaG%.*} > ${bam_metaG}
+conda deactivate
+
+# genome coverage in required format
+# https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html
+conda activate /scratch/users/vgalata/miniconda3/ONT_pilot/pipeline/121c497d
+bedtools genomecov -bg -ibam ${bam_metaG} > ${cov_metaG}
+bedtools genomecov -bg -ibam ${bam_metaT} > ${cov_metaT}
+conda deactivate
+awk -F"\t" '$1=="'${contig}'" {print $0}' ${cov_metaG} > ${cov_metaG}.${contig}
+awk -F"\t" '$1=="'${contig}'" {print $0}' ${cov_metaT} > ${cov_metaT}.${contig}
+
+# all/uniq genes from Flye
+grep "${contig}_" ${faa} | sed "s/^>${contig}_//;s/ # /\t/g" > ${ids_all}
+grep "^>${rtype2}__${tool2}__${contig}_" ${uniq} | sed "s/^>${rtype2}__${tool2}__${contig}_//;s/ # /\t/g" | sort -n -k1,1 > ${ids_uniq}
+
+# plot
+conda activate /home/users/vgalata/miniconda3/envs/ONT_pilot/pipeline/e231f1b8
+Rscript /mnt/irisgpfs/users/vgalata/projects/ONT_pilot/notes/cdhit_uniq_cov.R ${cov_metaG}.${contig} ${cov_metaT}.${contig} ${ids_uniq} ${ids_all} "Proteins in ${rtype2}_${tool2} but not in ${rtype1}_${tool1}, contig ${contig}" ${pdf}
+conda deactivate
\ No newline at end of file
-- 
GitLab