Skip to content
Snippets Groups Projects
Commit 1991001e authored by Susheel Busi's avatar Susheel Busi
Browse files

adding updated files. Files for moving results folder, making...

adding updated files. Files for moving results folder, making mmseq_upset_plots and a test mmseq_snakefile
parent 34f7c373
No related branches found
No related tags found
1 merge request!20Major changes including plotting files
......@@ -220,7 +220,12 @@ rule CONVERT_MMSEQ2:
expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_megahit.m8")),
expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/flye_metaspades_hybrid.m8")),
expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/megahit_metaspades_hybrid.m8"))
rule PLOT_MMSEQ2:
input:
expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done")),
expand(os.path.join(RESULTS_DIR, "annotation/mmseq2/upset_plots.done"))
######
# RULES
######
......@@ -1339,13 +1344,39 @@ rule mmseq2_m8_format:
date) &> >(tee {log})
"""
rule mmseq_compare:
rule prepare_plot_files:
output:
touch(os.path.join(RESULTS_DIR, "annotation/mmseq2/plot_files_ready.done"))
log: os.path.join(RESULTS_DIR, "annotation/mmseq2/files_ready.mmseq2.log")
shell:
"""
(date &&\
./prepare_plot_files.sh &&\
date) &> >(tee {log})
"""
rule mmseq2_plots:
input:
f"{RESULTS_DIR}/annotation/mmseq2/{{assembler1}}_db",
f"{RESULTS_DIR}/annotation/mmseq2/{{assembler2}}_db"
plot1=os.path.join(RESULTS_DIR, "annotation/mmseq2/overlap_sizes.txt"),
plot2=os.path.join(RESULTS_DIR, "annotation/mmseq2/total.txt")
output:
f"{RESULTS_DIR}/annotation/mmseq2/{{assembler1}}__{{assembler2}}_rbh"
conda:
"cd-hit.yaml"
touch(os.path.join(RESULTS_DIR, "annotation/mmseq2/upset_plots.done"))
log: os.path.join(RESULTS_DIR, "annotation/mmseq2/plot.mmseq2.log")
conda: "renv.yaml"
shell:
"mmseqs rbh {input[0]} {input[1]} {output} --min-seq-id 0.9 mmseq2_tmp --threads 12"
"""
(date &&\
Rscript mmseq_plots.R {input.plot1} {input.plot2} &&\
date) &> >(tee {log})
"""
# rule mmseq_compare:
# input:
# f"{RESULTS_DIR}/annotation/mmseq2/{{assembler1}}_db",
# f"{RESULTS_DIR}/annotation/mmseq2/{{assembler2}}_db"
# output:
# f"{RESULTS_DIR}/annotation/mmseq2/{{assembler1}}__{{assembler2}}_rbh"
# conda:
# "cd-hit.yaml"
# shell:
# "mmseqs rbh {input[0]} {input[1]} {output} --min-seq-id 0.9 mmseq2_tmp --threads 12"
name: cd-hit
channels:
- conda-forge
- bioconda
- defaults
- r
- etetoolkit
- au-eoed
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=0_gnu
- bzip2=1.0.8=h516909a_2
- ca-certificates=2020.4.5.1=hecc5488_0
- cd-hit=4.8.1=h8b12597_3
- gawk=5.1.0=h516909a_0
- gettext=0.19.8.1=hc5be6a0_1002
- libffi=3.2.1=he1b5a44_1007
- libgcc-ng=9.2.0=h24d8f2e_2
- libgomp=9.2.0=h24d8f2e_2
- libidn2=2.3.0=h516909a_0
- libstdcxx-ng=9.2.0=hdf63c60_2
- libunistring=0.9.10=h14c3975_0
- llvm-openmp=8.0.1=hc9558a2_0
- mmseqs2=11.e1a1c=h2d02072_0
- openmp=8.0.1=0
- openssl=1.1.1f=h516909a_0
- wget=1.20.1=h22169c7_0
- zlib=1.2.11=h516909a_1006
prefix: /home/users/sbusi/apps/miniconda3/envs/cd-hit
name: renv
channels:
- conda-forge
- bioconda
- defaults
- r
- etetoolkit
- au-eoed
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=1_llvm
- _r-mutex=1.0.1=anacondar_1
- binutils_impl_linux-64=2.34=h53a641e_0
- binutils_linux-64=2.34=hc952b39_18
- bwidget=1.9.14=0
- bzip2=1.0.8=h516909a_2
- ca-certificates=2020.4.5.1=hecc5488_0
- cairo=1.14.12=h8948797_3
- curl=7.69.1=h33f0ec9_0
- fontconfig=2.13.0=h9420a91_0
- freetype=2.10.1=he06d7ca_0
- fribidi=1.0.9=h516909a_0
- gcc_impl_linux-64=7.3.0=hd420e75_5
- gcc_linux-64=7.3.0=h553295d_18
- gfortran_impl_linux-64=7.3.0=hdf63c60_5
- gfortran_linux-64=7.3.0=h553295d_18
- glib=2.63.1=h5a9c865_0
- graphite2=1.3.13=he1b5a44_1001
- gsl=2.4=h294904e_1006
- gxx_impl_linux-64=7.3.0=hdf63c60_5
- gxx_linux-64=7.3.0=h553295d_18
- harfbuzz=1.8.8=hffaf4a1_0
- icu=58.2=hf484d3e_1000
- jpeg=9c=h14c3975_1001
- krb5=1.17.1=h2fd8d38_0
- ld_impl_linux-64=2.34=h53a641e_0
- libblas=3.8.0=16_openblas
- libcblas=3.8.0=16_openblas
- libcurl=7.69.1=hf7181ac_0
- libedit=3.1.20181209=hc058e9b_0
- libffi=3.2.1=he1b5a44_1007
- libgcc-ng=9.2.0=h24d8f2e_2
- libgfortran-ng=7.3.0=hdf63c60_5
- libgomp=9.2.0=h24d8f2e_2
- libopenblas=0.3.9=h5ec1e0e_0
- libpng=1.6.37=hed695b0_1
- libssh2=1.8.2=h22169c7_2
- libstdcxx-ng=9.2.0=hdf63c60_2
- libtiff=4.1.0=hc7e4089_6
- libuuid=1.0.3=h1bed415_2
- libwebp-base=1.1.0=h516909a_3
- libxcb=1.13=h14c3975_1002
- libxml2=2.9.9=hea5a465_1
- llvm-openmp=10.0.0=hc9558a2_0
- lz4-c=1.9.2=he1b5a44_0
- make=4.3=h516909a_0
- ncurses=6.2=he6710b0_0
- openssl=1.1.1f=h516909a_0
- pango=1.42.4=h049681c_0
- pcre=8.44=he1b5a44_0
- pixman=0.38.0=h516909a_1003
- pthread-stubs=0.4=h14c3975_1001
- r-base=3.5.3=h26b83e4_0
- readline=8.0=h7b6447c_0
- tk=8.6.10=hed695b0_0
- tktable=2.10=h555a92e_3
- xorg-libxau=1.0.9=h14c3975_0
- xorg-libxdmcp=1.1.3=h516909a_0
- xz=5.2.5=h516909a_0
- zlib=1.2.11=h516909a_1006
- zstd=1.4.4=h6597ccf_3
prefix: /home/users/sbusi/apps/miniconda3/envs/renv
This diff is collapsed.
#!/usr/bin/env Rscript
# For creating upset plots using mmseqs2 output
# usage: Rscript mmseq_plots.R sizes_file.txt total_file.txt
args <- commandArgs(trailingOnly = TRUE)
# args<-commandArgs(TRUE)
# sizes-args[1]
# total<-args[2]
################
# Dependencies #
################
library(ggplot2)
library(tidyr)
library(dplyr)
library(readr)
library(readxl)
library(stringr)
library(cowplot)
library(tidyverse)
##########
## Data ##
##########
overlap_sizes <- read.table(args[1])
Total <- read.table(args[2])
sets <- c("flye", "megahit", "metaspades")
category <- c(
"flye",
"megahit",
"metaspades",
"flye_megahit",
"flye_metaspades",
"megahit_metaspades",
"flye_megahit_metaspades"
)
# Creating the "Overlap" file for the upset plots
Overlap <- expand.grid( #generate all the combinations
"sets" = sets,
"category" = category
) %>%
as.data.frame() %>% #determine the intersection: a character col of Y or N.
mutate(intersect = case_when(
str_detect(category, sets %>% as.character()) ~ "Y",
T ~ "N"
))
# Upper-left plot
upperleft <- Total %>%
ggplot(aes(x = V1, y= V2)) +
geom_hline(yintercept = -Inf, size = 1.5) +
geom_vline(xintercept = -Inf, size = 1.5) +
geom_bar(stat = "identity", aes(fill = V1), alpha = 0.8) +
geom_text(aes(label = as.character(V2)), size = 5, angle = 90, hjust = 0, y = 1, fontface = "bold") +
scale_fill_manual(values = c("orangered3", "seagreen", "dodgerblue2"), #this is my own custom palette
limits = c("flye", "megahit", "metaspades")) + #feel free to use something else
scale_x_discrete(labels = NULL) +
scale_y_continuous(labels = NULL) +
labs(x = NULL,
y = "set size") +
theme_minimal() +
theme(legend.position = "none") +
theme(text = element_text(size= 16, face="bold")) +
theme(axis.text.x=element_text(colour = "black", angle = 45, hjust = 1)) +
theme(axis.text.y=element_text(colour = "black")) +
theme(panel.grid = element_blank())
upperleft
# Lower-left plot
Overlap$category <- factor(Overlap$category)
lowerleft <- Overlap %>%
mutate(category = factor(rev(Overlap$category))) %>% #in the colored matrix the first y value appears in the bottom, so the order need to be reversed
ggplot(aes(x = sets, y = category))+
geom_tile(aes(fill = sets, alpha = intersect), color = "black", size = 1.5) +
scale_fill_manual(values = c("orangered1", "seagreen", "dodgerblue2"),
limits = c("flye", "megahit", "metaspades")) +
scale_alpha_manual(values = c(0.8, 0), #color the grid for Y, don't color for N.
limits = c("Y", "N")) +
scale_y_discrete(labels = NULL) +
scale_x_discrete(labels = rep(" ", length(Overlap$sets))) + #I left white space here for better alignment w/ extended plots
labs(x = " ", #white space for better alignment w/ right side plots
y = "overlap") +
theme_minimal() +
theme(legend.position = "none") +
theme(text = element_text(size= 16, face="bold")) +
theme(axis.text.x=element_text(colour = "black")) +
theme(axis.text.y=element_text(colour = "black")) +
theme(panel.grid = element_blank())
lowerleft
# Upper-right plot
Assembly_method <- c("flye", "megahit", "metaspades")
upperright <- get_legend(
Total %>%
ggplot(aes(x = V1, y= V2)) +
geom_hline(yintercept = -Inf, size = 1.5) +
geom_vline(xintercept = -Inf, size = 1.5) +
geom_bar(stat = "identity", aes(fill = Assembly_method), alpha = 0.9) +
geom_text(aes(label = "Assembly method"), size = 40, angle = 90, hjust = 0, y = 1, fontface = "bold") +
scale_fill_manual(values = c("orangered3", "seagreen", "dodgerblue2"), #this is my own custom palette
limits = c("flye", "megahit", "metaspades")) + #feel free to use something else
scale_x_discrete(labels = NULL) +
scale_y_continuous(labels = NULL) +
labs(x = NULL,
y = "set size") +
theme_minimal() +
theme(legend.position = "right") +
theme(text = element_text(size= 16, face="bold")) +
theme(axis.text.x=element_text(colour = "black", angle = 45, hjust = 1)) +
theme(axis.text.y=element_text(colour = "black")) +
theme(panel.grid = element_blank())
)
plot_grid(upperright)
# Lower-right plot
overlap_sizes <- overlap_sizes %>% mutate(V2 = factor(V2, levels = rev(V2))) #the order needs to be reversed for the figure to match
lowerright <- overlap_sizes %>%
ggplot(aes(x = V2, y = V1)) +
geom_hline(yintercept = -Inf, size = 1.5) +
geom_vline(xintercept = -Inf, size = 1.5) +
geom_bar(stat = "identity", fill = "grey80", color = NA, alpha = 0.8) +
geom_text(aes(label = V1, y = 0), size = 5, hjust = 0, vjust = 0.5, fontface = "bold") +
scale_y_continuous(breaks = c(0, max(overlap_sizes$V1)) ,
labels = rep(" ", 2)) + #I left white space here for better alignment w/ extended plots
scale_x_discrete(labels = NULL) +
labs(y = "intersect sizes",
x = NULL) +
theme_minimal() +
theme(text = element_text(size= 15, face="bold")) +
theme(axis.text.x=element_text(colour = "black", angle = 45, hjust = 1)) +
theme(axis.text.y=element_text(colour = "black")) +
theme(panel.grid = element_blank()) +
coord_flip()
lowerright
# Saving the plot
pdf(file="ONT_protein_assembler_mseq_Upset_plot.pdf")
plot_grid(upperleft, upperright, lowerleft, lowerright,
nrow = 2,
ncol = 2,
rel_heights = c(0.95, 1.5), #the more rows in the lower part, the longer it should be
rel_widths = c(0.95, 0.95))
dev.off()
#!/bin/bash -l
cd /scratch/users/sbusi/ONT/cedric_ont_basecalling/
mv -vf results mod_basecalled_results
mkdir results
cp -vrf mod_basecalled_results/basecalled_NO_MOD results/basecalled
#!/bin/bash -l
cd results/annotation/mmseq2/
rm -v *.txt # ensuring previously created files do not exist
# getting the number of common sequences between all three assembly methods
awk '{print $0=$1"\t"$2}' flye_megahit.m8 > flye_megahit.txt
awk '{print $0=$1"\t"$2}' flye_metaspades_hybrid.m8 > flye_metaspades.txt
awk '{print $0=$1"\t"$2}' megahit_metaspades_hybrid.m8 > megahit_metaspades.txt
# merging two files based on common values in 1st column
awk '{print $0=$2"\t"$1}' flye_megahit.txt > flye_megahit_ordered.txt
join -j 1 -o 1.1,1.2,2.2 <(sort -k1 flye_megahit_ordered.txt) <(sort -k1 megahit_metaspades.txt) > flye_megahit_metaspades.txt
# preparing input "overlap_sizes.txt" file for plots
for file in *_db
do
wc -l "$file" >> mmseq_overlap_sizes.txt
done
for file in *.m8
do
wc -l "$file" >> mmseq_overlap_sizes.txt
done
wc -l flye_megahit_metaspades.txt >> mmseq_overlap_sizes.txt
sed 's/_db//g' mmseq_overlap_sizes.txt | sed 's/.m8//g' | sed 's/.txt//g' | sed 's/_hybrid//g' > overlap_sizes.txt
# creating the "total.txt" file
for file in *_db
do
wc -l "$file" >> temp_total.txt
done
awk '{print $0=$2"\t"$1}' temp_total.txt | sed 's/_db//g' | sed 's/_hybrid//g' > total.txt
date
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment