diff --git a/2019_GDB/rules/PLOT_RULES b/2019_GDB/rules/PLOT_RULES new file mode 100755 index 0000000000000000000000000000000000000000..fdbb71a6d52dfb2bb6515d45020fe5d7a0cb8823 --- /dev/null +++ b/2019_GDB/rules/PLOT_RULES @@ -0,0 +1,48 @@ +# For running the MMSEQ2 comparison of proteins after assemblies are run through prokka/prodigal + +import os +#from tempfile import TemporaryDirectory + +configfile: "config/CONFIG.yaml" +#DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +#DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +#ASSEMBLERS=config["assemblers"] +#MAPPERS=["bwa", "mm"] +## SAMPLES=config["samples"] +#SAMPLES=["flye", "megahit", "metaspades_hybrid"] +#BINNING_SAMPLES=config["binning_samples"] +#HYBRID_ASSEMBLER=config["hybrid_assembler"] +SR_PREFIX="ONT3_MG_xx_Rashi_S11" + + +############################# +###### IGC COVERAGE ######## +############################# +rule all: + input: + expand("{results_dir}/plots/genomecov/{sr_prefix}_vs_{lr_prefix}-x-{reference}_coverage.html", results_dir=RESULTS_DIR, sr_prefix=SR_PREFIX, lr_prefix=BARCODES, reference="igc"), + expand("{results_dir}/plots/annotation/diamond/lr_{lr_prefix}-sr{sr_prefix}-gene_length_ratio.html", results_dir=RESULTS_DIR, sr_prefix=SR_PREFIX, lr_prefix=BARCODES) + + +rule igc_correlation_plot: + input: + sr_cov=os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sr_prefix}-x-{reference}.avg_cov.txt"), + lr_cov=os.path.join(RESULTS_DIR, "genomecov/lr/merged/{lr_prefix}/{lr_prefix}-x-{reference}.avg_cov.txt") + output: os.path.join(RESULTS_DIR, "plots/genomecov/{sr_prefix}_vs_{lr_prefix}-x-{reference}_coverage.html") + #log: os.path.join(RESULTS_DIR, "plots/mapping/sr_lr_igc_coverage.log") + conda: "../../envs/r_ggplot2_datatable_tidyverse.yaml" + script: + "../scripts/sr_lr_igc_coverage_correlation.Rmd" + +rule gene_length_ratio_plot: + input: + sr_diamond=os.path.join(RESULTS_DIR, "annotation/diamond/megahit/{sr_prefix}/final.contigs.tsv"), + lr_diamond=os.path.join(RESULTS_DIR, "annotation/diamond/flye/lr/merged/{lr_prefix}/assembly.tsv"), + hybrid_diamond=os.path.join(RESULTS_DIR, "annotation/diamond/metaspades_hybrid/lr_{lr_prefix}-sr_{sr_prefix}/contigs.tsv") + output: os.path.join(RESULTS_DIR, "plots/annotation/diamond/lr_{lr_prefix}-sr{sr_prefix}-gene_length_ratio.html") + #log: os.path.join(RESULTS_DIR, "plots/mapping/sr_lr_igc_coverage.log") + conda: "../../envs/r_ggplot2_datatable_tidyverse.yaml" + script: + "../scripts/gene_length_ratio.Rmd" diff --git a/2019_GDB/scripts/sr_lr_igc_coverage_correlation.R b/2019_GDB/scripts/sr_lr_igc_coverage_correlation.R new file mode 100755 index 0000000000000000000000000000000000000000..24591bd6448397803fbb290428503ee396b9efc4 --- /dev/null +++ b/2019_GDB/scripts/sr_lr_igc_coverage_correlation.R @@ -0,0 +1,49 @@ +#!/usr/bin/env R + +library(tidyverse) +library(data.table) + +### +# DATA ##### +### +sr_cov_filename <- snakemake@input[["sr_cov"]] +lr_cov_filename <- snakemake@input[["lr_cov"]] + +sr_cov <- fread(sr_cov_filename, header=F, sep=" ") # fread is much faster +lr_cov <- fread(lr_cov_filename, header=F, sep=" ") # fread is much faster + +# Prepare the data +sr_cov.non_zero <- sr_cov %>% + filter(V2 > 0) +lr_cov.non_zero <- lr_cov %>% + filter(V2 > 0) +sr_lr_cov.non_zero <- merge(sr_cov.non_zero, lr_cov.non_zero, by="V1") +sr_lr_cov.non_zero <- sr_lr_cov.non_zero %>% + filter(V2.x > 0 & V2.y > 0) + +# Compute the correlation +spearman_cor <- cor(sr_lr_cov.non_zero$V2.x, + sr_lr_cov.non_zero$V2.y, method="spearman") +pearson_cor <- cor(sr_lr_cov.non_zero$V2.x, + sr_lr_cov.non_zero$V2.y, method="pearson") + +# Print the correlations +print("Spearman correlation:") +print(spearman_cor) +print("Pearson correlation:") +print(pearson_cor) + +# Plot the coverages +p <- ggplot(sr_lr_cov.non_zero, aes(x = V2.x, y = V2.y)) + + geom_point(alpha = 0.1) + + ylab("Long read coverage") + + xlab("Short read coverage") + + ggtitle("IGC cov. corr. (SR & LR cov > 0)") + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank()) +pdf(snakemake@output[["plot"]]) +print(p) +dev.off() + + + diff --git a/rules/PLOT_RULES b/rules/PLOT_RULES new file mode 100755 index 0000000000000000000000000000000000000000..fdbb71a6d52dfb2bb6515d45020fe5d7a0cb8823 --- /dev/null +++ b/rules/PLOT_RULES @@ -0,0 +1,48 @@ +# For running the MMSEQ2 comparison of proteins after assemblies are run through prokka/prodigal + +import os +#from tempfile import TemporaryDirectory + +configfile: "config/CONFIG.yaml" +#DATA_DIR = config["data_dir"] +RESULTS_DIR = config["results_dir"] +#DB_DIR=config["db_dir"] +BARCODES=config["barcodes"] +#ASSEMBLERS=config["assemblers"] +#MAPPERS=["bwa", "mm"] +## SAMPLES=config["samples"] +#SAMPLES=["flye", "megahit", "metaspades_hybrid"] +#BINNING_SAMPLES=config["binning_samples"] +#HYBRID_ASSEMBLER=config["hybrid_assembler"] +SR_PREFIX="ONT3_MG_xx_Rashi_S11" + + +############################# +###### IGC COVERAGE ######## +############################# +rule all: + input: + expand("{results_dir}/plots/genomecov/{sr_prefix}_vs_{lr_prefix}-x-{reference}_coverage.html", results_dir=RESULTS_DIR, sr_prefix=SR_PREFIX, lr_prefix=BARCODES, reference="igc"), + expand("{results_dir}/plots/annotation/diamond/lr_{lr_prefix}-sr{sr_prefix}-gene_length_ratio.html", results_dir=RESULTS_DIR, sr_prefix=SR_PREFIX, lr_prefix=BARCODES) + + +rule igc_correlation_plot: + input: + sr_cov=os.path.join(RESULTS_DIR, "genomecov/sr/bwa_mem/{sr_prefix}-x-{reference}.avg_cov.txt"), + lr_cov=os.path.join(RESULTS_DIR, "genomecov/lr/merged/{lr_prefix}/{lr_prefix}-x-{reference}.avg_cov.txt") + output: os.path.join(RESULTS_DIR, "plots/genomecov/{sr_prefix}_vs_{lr_prefix}-x-{reference}_coverage.html") + #log: os.path.join(RESULTS_DIR, "plots/mapping/sr_lr_igc_coverage.log") + conda: "../../envs/r_ggplot2_datatable_tidyverse.yaml" + script: + "../scripts/sr_lr_igc_coverage_correlation.Rmd" + +rule gene_length_ratio_plot: + input: + sr_diamond=os.path.join(RESULTS_DIR, "annotation/diamond/megahit/{sr_prefix}/final.contigs.tsv"), + lr_diamond=os.path.join(RESULTS_DIR, "annotation/diamond/flye/lr/merged/{lr_prefix}/assembly.tsv"), + hybrid_diamond=os.path.join(RESULTS_DIR, "annotation/diamond/metaspades_hybrid/lr_{lr_prefix}-sr_{sr_prefix}/contigs.tsv") + output: os.path.join(RESULTS_DIR, "plots/annotation/diamond/lr_{lr_prefix}-sr{sr_prefix}-gene_length_ratio.html") + #log: os.path.join(RESULTS_DIR, "plots/mapping/sr_lr_igc_coverage.log") + conda: "../../envs/r_ggplot2_datatable_tidyverse.yaml" + script: + "../scripts/gene_length_ratio.Rmd" diff --git a/scripts/sr_lr_igc_coverage_correlation.R b/scripts/sr_lr_igc_coverage_correlation.R new file mode 100755 index 0000000000000000000000000000000000000000..24591bd6448397803fbb290428503ee396b9efc4 --- /dev/null +++ b/scripts/sr_lr_igc_coverage_correlation.R @@ -0,0 +1,49 @@ +#!/usr/bin/env R + +library(tidyverse) +library(data.table) + +### +# DATA ##### +### +sr_cov_filename <- snakemake@input[["sr_cov"]] +lr_cov_filename <- snakemake@input[["lr_cov"]] + +sr_cov <- fread(sr_cov_filename, header=F, sep=" ") # fread is much faster +lr_cov <- fread(lr_cov_filename, header=F, sep=" ") # fread is much faster + +# Prepare the data +sr_cov.non_zero <- sr_cov %>% + filter(V2 > 0) +lr_cov.non_zero <- lr_cov %>% + filter(V2 > 0) +sr_lr_cov.non_zero <- merge(sr_cov.non_zero, lr_cov.non_zero, by="V1") +sr_lr_cov.non_zero <- sr_lr_cov.non_zero %>% + filter(V2.x > 0 & V2.y > 0) + +# Compute the correlation +spearman_cor <- cor(sr_lr_cov.non_zero$V2.x, + sr_lr_cov.non_zero$V2.y, method="spearman") +pearson_cor <- cor(sr_lr_cov.non_zero$V2.x, + sr_lr_cov.non_zero$V2.y, method="pearson") + +# Print the correlations +print("Spearman correlation:") +print(spearman_cor) +print("Pearson correlation:") +print(pearson_cor) + +# Plot the coverages +p <- ggplot(sr_lr_cov.non_zero, aes(x = V2.x, y = V2.y)) + + geom_point(alpha = 0.1) + + ylab("Long read coverage") + + xlab("Short read coverage") + + ggtitle("IGC cov. corr. (SR & LR cov > 0)") + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank()) +pdf(snakemake@output[["plot"]]) +print(p) +dev.off() + + +