diff --git a/notes/cdhit_uniq_cov.R b/notes/cdhit_uniq_cov.R new file mode 100644 index 0000000000000000000000000000000000000000..bd155325773fda3d3733aa121cbb5090f189d898 --- /dev/null +++ b/notes/cdhit_uniq_cov.R @@ -0,0 +1,56 @@ +#!/usr/bin/Rscript + +## NOTE +# XXX + +## ARGS +ARGS <- commandArgs(trailingOnly=TRUE) +COV_G <- ARGS[1] +COV_T <- ARGS[2] +UNIQ <- ARGS[3] +GENES <- ARGS[4] +TITLE <- ARGS[5] +PDF <- ARGS[6] +cols_cov <- c("contig", "start", "end", "coverage") +cols_gen <- c("gene", "start", "end", "strand", "info") + +## IMPORT +suppressMessages(library(ggplot2)) # plotting +suppressMessages(library(reshape2)) # reshaping dataframes +suppressMessages(library(scales)) # plot scales +suppressMessages(library(viridis)) # color palette + +## DATA +# Coverage +df_cov_G <- read.csv(file=COV_G, sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_cov) +df_cov_T <- read.csv(file=COV_T, sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_cov) +# Genes +df_uniq <- read.csv(file=UNIQ, sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_gen) +df_genes <- read.csv(file=GENES, sep="\t", header=FALSE, stringsAsFactors=FALSE, col.names=cols_gen) +df_genes$uniq <- df_genes$gene %in% df_uniq$gene + +## PLOT +cov_plot <- function(df_c, df_g){ + max_y <- max(df_c$coverage) + pp <- + ggplot(data=df_c) + + geom_rect(data=df_g, aes(xmin=start, xmax=end, ymin=0, ymax=max_y, fill=uniq), alpha=0.2) + + geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=coverage), fill="black", colour=NA) + + geom_hline(yintercept=10, colour="red") + + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) + + labs( + title=TITLE, + x="base", + y="coverage" + ) + + theme_bw() + return(pp) +} + +pp_G <- cov_plot(df_cov_G, df_genes) +pp_T <- cov_plot(df_cov_T, df_genes) + +pdf(PDF, width=60, height=4) +print(pp_G) +print(pp_T) +dev.off() diff --git a/notes/cdhit_uniq_cov.sh b/notes/cdhit_uniq_cov.sh new file mode 100644 index 0000000000000000000000000000000000000000..361a8580c3bb7bb6996fc14bcfc365e0ca5b769b --- /dev/null +++ b/notes/cdhit_uniq_cov.sh @@ -0,0 +1,70 @@ +#!/bin/bash -l + +#SBATCH -J ONT_GDB +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH -c 15 +#SBATCH --time=0-01:00:00 +#SBATCH -p batch +#SBATCH --qos=qos-batch + +# NOTE: script to compute and plot metaT and metaG coverage for a specific assembly +# and genes not found in another assembly +# NOTE: this is for testing purposes only !!! + +# Looking for conda env.s +# grep -r --include="*.yaml" "samtools" ~/miniconda3/envs/ONT_pilot/pipeline/ +# Contig cov. and length: +# sort -n -k2,2 ../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.cov.ave.txt | tail -n 10 +# sort -n -k2,2 ../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.cov.ave.txt | tail -n 10 | cut -f1 -d" " | while read f; do grep "${f}" ../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.idxstats.txt; done + +# TEST: lr_flye, contig_446, sr_metaspades +# TEST: lr_flye_sr, contig_446, sr_metaspades + +cd /mnt/lscratch/users/vgalata/ont_pilot/cdhit_cov_example + +rtype1="sr"; tool1="metaspades" +rtype2="lr"; tool2="flye_sr" + +contig="contig_446" + +threads=15 +sr1="../results/preproc/metag/sr/R1.fastp.fastq.gz" +sr2="../results/preproc/metag/sr/R2.fastp.fastq.gz" +asm="../results/assembly/${rtype2}/${tool2}/ASSEMBLY.FILTERED.fasta" +idx="../results/assembly/${rtype2}/${tool2}/ASSEMBLY.FILTERED" +faa="../results/annotation/prodigal/${rtype2}/${tool2}/proteins.faa" +uniq="../results/analysis/cdhit/${rtype1}_${tool1}__${rtype2}_${tool2}.faa" + +bam_metaG="${rtype2}_${tool2}.metaG.bam" +bam_metaT="../results/mapping/metat/${rtype2}/${tool2}/ASSEMBLY.FILTERED.sr.bam" +cov_metaG="${rtype2}_${tool2}.metaG.cov" +cov_metaT="${rtype2}_${tool2}.metaT.cov" +ids_all="${rtype2}_${tool2}_${contig}.ids" +ids_uniq="${rtype2}_${tool2}_${contig}.uniq.ids" +pdf="${rtype2}_${tool2}_${contig}.pdf" + +# BAMs +# metaG, SR: as in the rules +# metaT, SR: already exists +conda activate /home/users/vgalata/miniconda3/envs/ONT_pilot/pipeline/7070f59f +bwa mem -t ${threads} ${idx} ${sr1} ${sr2} | samtools view -@ ${threads} -SbT ${asm} | samtools sort -@ ${threads} -m 4G -T ${bam_metaG%.*} > ${bam_metaG} +conda deactivate + +# genome coverage in required format +# https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html +conda activate /scratch/users/vgalata/miniconda3/ONT_pilot/pipeline/121c497d +bedtools genomecov -bg -ibam ${bam_metaG} > ${cov_metaG} +bedtools genomecov -bg -ibam ${bam_metaT} > ${cov_metaT} +conda deactivate +awk -F"\t" '$1=="'${contig}'" {print $0}' ${cov_metaG} > ${cov_metaG}.${contig} +awk -F"\t" '$1=="'${contig}'" {print $0}' ${cov_metaT} > ${cov_metaT}.${contig} + +# all/uniq genes from Flye +grep "${contig}_" ${faa} | sed "s/^>${contig}_//;s/ # /\t/g" > ${ids_all} +grep "^>${rtype2}__${tool2}__${contig}_" ${uniq} | sed "s/^>${rtype2}__${tool2}__${contig}_//;s/ # /\t/g" | sort -n -k1,1 > ${ids_uniq} + +# plot +conda activate /home/users/vgalata/miniconda3/envs/ONT_pilot/pipeline/e231f1b8 +Rscript /mnt/irisgpfs/users/vgalata/projects/ONT_pilot/notes/cdhit_uniq_cov.R ${cov_metaG}.${contig} ${cov_metaT}.${contig} ${ids_uniq} ${ids_all} "Proteins in ${rtype2}_${tool2} but not in ${rtype1}_${tool1}, contig ${contig}" ${pdf} +conda deactivate \ No newline at end of file