diff --git a/.gitignore b/.gitignore index 112189a0a6a312add8be1f3a52ad5d3f87026b5f..511f4a000b6c75e9bfe801c79a95887597bbe221 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,10 @@ figures/output/** figures/data/nanostats_summary.tsv figures/data/crispr_summary.tsv +figures/data/plasflow_summary.tsv +figures/data/rgi_summary.tsv **/.snakemake/ +**/__pycache__/ slurm-*.out *.tmp *.log \ No newline at end of file diff --git a/2019_GDB/scripts/get_mappability.sh b/2019_GDB/scripts/get_mappability.sh new file mode 100644 index 0000000000000000000000000000000000000000..6435129f0f48ed64d388c6ee62909d2baf557648 --- /dev/null +++ b/2019_GDB/scripts/get_mappability.sh @@ -0,0 +1,30 @@ +#!/bin/bash -l +# Usage: sbatch -t 1:00:00 -N 1 -n 4 get_mappability.sh + +date +cd /scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/mapping +conda activate anvio6 # or any environment that as "samtools" + +# metaG +samtools flagstat sr/bwa_mem/ONT3_MG_xx_Rashi_S11_reads-x-ONT3_MG_xx_Rashi_S11-megahit_contigs.bam > sr_mappability.txt +samtools flagstat lr/merged/no_barcode/no_barcode_reads-x-no_barcode-flye_contigs.bam > lr_mappability.txt + +samtools flagstat bwa_merged_metaspades_hybrid/bwa_merged_metaspades_hybrid.bam > hybrid_bwa_merged.txt +samtools flagstat bwa_sr_metaspades_hybrid/bwa_sr_metaspades_hybrid.bam > hybrid_bwa_sr.txt +samtools flagstat bwa_lr_metaspades_hybrid/bwa_lr_metaspades_hybrid.bam > hybrid_bwa_lr.txt + +samtools flagstat mmi_merged_metaspades_hybrid/mmi_merged_metaspades_hybrid.bam > hybrid_mmi_merged.txt +samtools flagstat mmi_sr_metaspades_hybrid/mmi_sr_metaspades_hybrid.bam > hybrid_mmi_sr.txt +samtools flagstat mmi_lr_metaspades_hybrid/mmi_lr_metaspades_hybrid.bam > hybrid_mmi_lr.txt + +# metaT +samtools flagstat ./metaT/sr/FastSelectHalf1_MT_Rashi_S12_reads-x-lr_no_barcode_sr_ONT3_MG_xx_Rashi_S11-metaspades_hybrid_contigs.bam > hybrid_metaT_sr.txt +samtools flagstat ./metaT/sr/FastSelectHalf1_MT_Rashi_S12_reads-x-ONT3_MG_xx_Rashi_S11-megahit_contigs.bam > megahit_metaT_sr.txt + +# collating all the results in one place +for file in *.txt; do echo $file; egrep 'total | mapped' $file | head -n 2; done | paste - - - | \ + sed $'1 i\\\nsample\ttotal\tmapped\tpercent_mapped' | sed 's/ + 0 in total (QC-passed reads + QC-failed reads)//g' | \ + sed s'/ + 0 mapped//g' | sed 's/ : N\/A)//g' | sed s'/(//g' | sed s'/.txt//g' | \ + awk '{print$0=$1"\t"$2"\t"$3"\t"$4}' > mappability_index.tsv + +date \ No newline at end of file diff --git a/config/CONFIG.yaml b/config/CONFIG.yaml index aab8911bf4efe9daef1f357fbf111082335ffbd0..94b70d8b5571e73a4f9748709f2aa01bd9fce1f3 100755 --- a/config/CONFIG.yaml +++ b/config/CONFIG.yaml @@ -1,5 +1,5 @@ -# steps: "assembly_annotation mapping metaT mmseq binning taxonomy analysis" -steps: "binning taxonomy analysis" +# steps: "assembly_annotation mapping metaT mmseq binning taxonomy" +steps: "mapping metaT mmseq binning taxonomy" data_dir: "data" results_dir: "results" db_dir: "dbs" diff --git a/figures/README.md b/figures/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2bd5bb5c9465b3e1e0294ca2631707738d0fa69c --- /dev/null +++ b/figures/README.md @@ -0,0 +1,24 @@ +# About + +This directory contains code to create figures for the manuscript. + +- `data/`: data for the plots + - it is not required, the paths to the files can be set in `figures.yml` +- `envs/`: `conda` YAML files + - `envs/R.yml`: R-packages required for the figures + - `envs/figures.yml`: requirements to run the `Snakemake` pipeline +- `src/`: scripts required to generate the figures +- `src_SBB/`: scripts used by S. B. B. to generate figures + - for backup purposes +- `figures.smk`: `Snakemake` file to generate figures +- `figures.yml`: config file for `figures.smk` +- `README.md`: this file + +# Generating figures + +```bash +conda create -f envs/figures.yml +conda activate ont_figures +# rm -n to execute the steps +snakemake -s figures.smk --cores 1 --use-conda -r -p -n +``` \ No newline at end of file diff --git a/figures/data/.gitkeep b/figures/data/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/figures/data/2019_GDB_protein_partial_genes.txt b/figures/data/2019_GDB_protein_partial_genes.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbef47e79649f61fc7e02eeebda15c4fd05f1c4a --- /dev/null +++ b/figures/data/2019_GDB_protein_partial_genes.txt @@ -0,0 +1,5 @@ +tool metaspades metaspades_hybrid +prodigal_partial 251907 169318 +prodigal_total 365813 341322 +cdhit_unique 27526 63911 +cdhit_total 365813 341322 diff --git a/figures/data/bwa_lr_metaspades_hybrid_HQ_bins.txt b/figures/data/bwa_lr_metaspades_hybrid_HQ_bins.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fbf7e9eab71d9d3df5689304370374bb96a207f --- /dev/null +++ b/figures/data/bwa_lr_metaspades_hybrid_HQ_bins.txt @@ -0,0 +1,11 @@ +maxbin_output.024 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__Eubacterium_R;s__Eubacterium_R 97.32 2.48 0.00 +maxbin_output.021 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Ruminococcus_D;s__Ruminococcus_D 98.63 0.00 0.00 +maxbin_output.015 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Acetatifactor;s__Acetatifactor 98.07 3.51 8.33 +maxbin_output.020 d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Acidaminococcales;f__Acidaminococcaceae;g__Phascolarctobacterium;s__Phascolarctobacterium 99.98 2.89 0.00 +maxbin_output.001 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides 99.18 0.50 0.00 +maxbin_output.022 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Barnesiellaceae;g__Barnesiella;s__Barnesiella 95.85 0.38 0.00 +maxbin_output.027 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas 94.98 2.31 33.33 +maxbin_output.031 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Odoribacter;s__Odoribacter 88.51 2.69 0.00 +maxbin_output.007 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 100.00 5.74 0.00 +maxbin_output.012 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Tidjanibacter;s__Tidjanibacter 99.52 1.44 0.00 +maxbin_output.023 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__;s__ 95.97 0.67 0.00 diff --git a/figures/data/bwa_merged_metaspades_hybrid_HQ_bins.txt b/figures/data/bwa_merged_metaspades_hybrid_HQ_bins.txt new file mode 100644 index 0000000000000000000000000000000000000000..26f9e6ada62f07a566238928ed8d8f7e85ee14ce --- /dev/null +++ b/figures/data/bwa_merged_metaspades_hybrid_HQ_bins.txt @@ -0,0 +1,27 @@ +bwa_merged_metaspades_hybrid.metabat.58 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__Eubacterium_R;s__Eubacterium_R 97.32 0.00 0.00 +maxbin_output.020 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Ruminococcus_D;s__Ruminococcus_D 98.63 0.48 0.00 +bwa_merged_metaspades_hybrid.metabat.26 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium;s__Faecalibacterium 94.29 0.00 0.00 +bwa_merged_metaspades_hybrid.metabat.6 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Oscillospiraceae;g__Oscillibacter;s__Oscillibacter 87.58 1.85 75.00 +bwa_merged_metaspades_hybrid.metabat.25 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__TANB77;f__CAG-508;g__CAG-492;s__CAG-492 71.52 5.01 7.69 +bwa_merged_metaspades_hybrid.metabat.22 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Acetatifactor;s__Acetatifactor 97.58 1.45 0.00 +bwa_merged_metaspades_hybrid.metabat.5_sub d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Faecalicatena;s__Faecalicatena 89.13 1.29 75.00 +bwa_merged_metaspades_hybrid.metabat.17 d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Acidaminococcales;f__Acidaminococcaceae;g__Succiniclasticum;s__Succiniclasticum 78.13 0.00 0.00 +maxbin_output.024 d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Acidaminococcales;f__Acidaminococcaceae;g__Phascolarctobacterium;s__Phascolarctobacterium 99.98 2.89 0.00 +bwa_merged_metaspades_hybrid.metabat.54 d__Bacteria;p__Cyanobacteria;c__Vampirovibrionia;o__Gastranaerophilales;f__Gastranaerophilaceae;g__CAG-196;s__CAG-196 77.34 1.66 0.00 +bwa_merged_metaspades_hybrid.metabat.34 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__UBA6398;s__UBA6398 88.41 1.28 33.33 +maxbin_output.002 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides 67.71 0.00 0.00 +bwa_merged_metaspades_hybrid.metabat.43 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides_A;s__Bacteroides_A 86.91 1.67 40.00 +bwa_merged_metaspades_hybrid.metabat.32_sub d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides_B;s__Bacteroides_B 92.95 1.67 50.00 +bwa_merged_metaspades_hybrid.metabat.80 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Tannerellaceae;g__Parabacteroides;s__Parabacteroides 78.08 0.13 0.00 +bwa_merged_metaspades_hybrid.metabat.55 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Barnesiellaceae;g__Barnesiella;s__Barnesiella 95.85 0.00 0.00 +bwa_merged_metaspades_hybrid.metabat.2 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Coprobacteraceae;g__Coprobacter;s__Coprobacter 91.32 0.57 0.00 +bwa_merged_metaspades_hybrid.metabat.45 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas 97.49 1.41 40.00 +maxbin_output.038_sub d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Odoribacter;s__Odoribacter 87.97 3.49 0.00 +bwa_merged_metaspades_hybrid.metabat.42 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 83.17 0.00 0.00 +maxbin_output.029 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 87.67 5.50 27.27 +maxbin_output.023 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 98.53 5.53 66.67 +maxbin_output.014 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Tidjanibacter;s__Tidjanibacter 99.52 1.44 0.00 +bwa_merged_metaspades_hybrid.metabat.73 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes_A;s__Alistipes_A 80.91 1.12 0.00 +bwa_merged_metaspades_hybrid.metabat.57 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__Parasutterella;s__Parasutterella 81.43 1.25 40.00 +maxbin_output.026 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__;s__ 93.96 0.67 0.00 +bwa_merged_metaspades_hybrid.metabat.63 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Oscillospiraceae;g__CAG-103;s__ 79.47 0.75 66.67 diff --git a/figures/data/bwa_sr_metaspades_hybrid_HQ_bins.txt b/figures/data/bwa_sr_metaspades_hybrid_HQ_bins.txt new file mode 100644 index 0000000000000000000000000000000000000000..f66892dce99d58b96dc77da9b53e9fbf72ee30af --- /dev/null +++ b/figures/data/bwa_sr_metaspades_hybrid_HQ_bins.txt @@ -0,0 +1,26 @@ +bwa_sr_metaspades_hybrid.metabat.37 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__Eubacterium_R;s__Eubacterium_R 97.32 0.00 0.00 +maxbin_output.014 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Ruminococcus_D;s__Ruminococcus_D 98.63 0.00 0.00 +bwa_sr_metaspades_hybrid.metabat.14 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium;s__Faecalibacterium 94.29 0.00 0.00 +bwa_sr_metaspades_hybrid.metabat.7 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Oscillospiraceae;g__Oscillibacter;s__Oscillibacter 87.58 1.85 75.00 +bwa_sr_metaspades_hybrid.metabat.27 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__TANB77;f__CAG-508;g__CAG-492;s__CAG-492 71.52 6.06 6.25 +bwa_sr_metaspades_hybrid.metabat.24 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Acetatifactor;s__Acetatifactor 98.31 1.93 16.67 +bwa_sr_metaspades_hybrid.metabat.6 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Faecalicatena;s__Faecalicatena 94.39 1.29 75.00 +bwa_sr_metaspades_hybrid.metabat.32_sub d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Acidaminococcales;f__Acidaminococcaceae;g__Succiniclasticum;s__Succiniclasticum 76.94 0.00 0.00 +maxbin_output.020 d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Acidaminococcales;f__Acidaminococcaceae;g__Phascolarctobacterium;s__Phascolarctobacterium 99.98 2.89 0.00 +bwa_sr_metaspades_hybrid.metabat.33 d__Bacteria;p__Cyanobacteria;c__Vampirovibrionia;o__Gastranaerophilales;f__Gastranaerophilaceae;g__CAG-196;s__CAG-196 77.34 0.38 0.00 +bwa_sr_metaspades_hybrid.metabat.42 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__UBA6398;s__UBA6398 88.41 1.28 33.33 +maxbin_output.001 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides 99.23 0.50 0.00 +maxbin_output.004 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides_B;s__Bacteroides_B 91.91 4.67 21.05 +maxbin_output.016 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Tannerellaceae;g__Parabacteroides;s__Parabacteroides 94.94 1.63 0.00 +bwa_sr_metaspades_hybrid.metabat.28 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Barnesiellaceae;g__Barnesiella;s__Barnesiella 95.85 0.00 0.00 +bwa_sr_metaspades_hybrid.metabat.3 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Coprobacteraceae;g__Coprobacter;s__Coprobacter 91.32 0.38 0.00 +bwa_sr_metaspades_hybrid.metabat.46 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas 97.49 1.41 40.00 +maxbin_output.034 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Odoribacter;s__Odoribacter 91.92 7.36 0.00 +maxbin_output.005 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 99.52 0.48 0.00 +bwa_sr_metaspades_hybrid.metabat.50_sub d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 98.04 1.44 66.67 +maxbin_output.011 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Tidjanibacter;s__Tidjanibacter 99.52 1.92 0.00 +bwa_sr_metaspades_hybrid.metabat.71 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes_A;s__Alistipes_A 80.91 1.12 0.00 +bwa_sr_metaspades_hybrid.metabat.55_sub d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__Parasutterella;s__Parasutterella 81.43 1.25 40.00 +maxbin_output.022 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__;s__ 95.97 0.67 0.00 +bwa_sr_metaspades_hybrid.metabat.53 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Oscillospiraceae;g__CAG-170;s__ 90.71 5.50 25.00 +bwa_sr_metaspades_hybrid.metabat.59 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Oscillospiraceae;g__CAG-103;s__ 84.75 0.73 50.00 diff --git a/figures/data/flye_HQ_bins.txt b/figures/data/flye_HQ_bins.txt new file mode 100644 index 0000000000000000000000000000000000000000..e464429dfa01c75362de93134985b0b486166da1 --- /dev/null +++ b/figures/data/flye_HQ_bins.txt @@ -0,0 +1 @@ +maxbin_output.001 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides 67.45 2.58 55.56 diff --git a/figures/data/mappability_over_1000bp.txt b/figures/data/mappability_over_1000bp.txt new file mode 100644 index 0000000000000000000000000000000000000000..75191afdea81bdada247d0c9616afe39a573f5f0 --- /dev/null +++ b/figures/data/mappability_over_1000bp.txt @@ -0,0 +1,11 @@ +sample mapped_reads total percent_mapped +flye_counts.txt 7144171 8351240 85.5462 +hybrid_bwa_lr_counts.txt 7347760 8299732 88.5301 +hybrid_bwa_merged_counts.txt 57368285 60114498 95.4317 +hybrid_bwa_sr_counts.txt 50020525 51814766 96.5372 +hybrid_metaT_sr_counts.txt 48979035 78868203 62.1024 +hybrid_mmi_lr_counts.txt 7044114 8402725 83.8313 +hybrid_mmi_merged_counts.txt 59404046 63790314 93.1239 +hybrid_mmi_sr_counts.txt 52359932 55387589 94.5337 +megahit_counts.txt 48800047 51826913 94.1597 +megahit_metaT_sr_counts.txt 45584230 78531498 58.0458 diff --git a/figures/data/mappability_over_1500bp.txt b/figures/data/mappability_over_1500bp.txt new file mode 100644 index 0000000000000000000000000000000000000000..961cdebb9460a460615c1757a1b2899cadf5da11 --- /dev/null +++ b/figures/data/mappability_over_1500bp.txt @@ -0,0 +1,11 @@ +sample mapped_reads total percent_mapped +flye_counts.txt 7130256 8351240 85.3796 +hybrid_bwa_lr_counts.txt 7265218 8299732 87.5356 +hybrid_bwa_merged_counts.txt 56903252 60114498 94.6581 +hybrid_bwa_sr_counts.txt 49638034 51814766 95.799 +hybrid_metaT_sr_counts.txt 45743715 78868203 58.0002 +hybrid_mmi_lr_counts.txt 6953663 8402725 82.7549 +hybrid_mmi_merged_counts.txt 58859670 63790314 92.2705 +hybrid_mmi_sr_counts.txt 51906007 55387589 93.7141 +megahit_counts.txt 47762664 51826913 92.158 +megahit_metaT_sr_counts.txt 36898046 78531498 46.985 diff --git a/figures/data/mappability_over_2000bp.txt b/figures/data/mappability_over_2000bp.txt new file mode 100644 index 0000000000000000000000000000000000000000..a10b62f93f99823023718287629d5393e3a4a3ff --- /dev/null +++ b/figures/data/mappability_over_2000bp.txt @@ -0,0 +1,11 @@ +sample mapped_reads total percent_mapped +flye_counts.txt 7121527 8351240 85.2751 +hybrid_bwa_lr_counts.txt 7200635 8299732 86.7574 +hybrid_bwa_merged_counts.txt 56512238 60114498 94.0077 +hybrid_bwa_sr_counts.txt 49311603 51814766 95.169 +hybrid_metaT_sr_counts.txt 42538190 78868203 53.9358 +hybrid_mmi_lr_counts.txt 6887067 8402725 81.9623 +hybrid_mmi_merged_counts.txt 58426554 63790314 91.5916 +hybrid_mmi_sr_counts.txt 51539487 55387589 93.0524 +megahit_counts.txt 46946179 51826913 90.5826 +megahit_metaT_sr_counts.txt 32506911 78531498 41.3935 diff --git a/figures/data/mappability_over_5000bp.txt b/figures/data/mappability_over_5000bp.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa74f416825c4db4c6e4120f36c20b2d1b049bb9 --- /dev/null +++ b/figures/data/mappability_over_5000bp.txt @@ -0,0 +1,11 @@ +sample mapped_reads total percent_mapped +flye_counts.txt 7066935 8351240 84.6214 +hybrid_bwa_lr_counts.txt 6934846 8299732 83.5551 +hybrid_bwa_merged_counts.txt 54644383 60114498 90.9005 +hybrid_bwa_sr_counts.txt 47709537 51814766 92.0771 +hybrid_metaT_sr_counts.txt 29346178 78868203 37.2091 +hybrid_mmi_lr_counts.txt 6611655 8402725 78.6847 +hybrid_mmi_merged_counts.txt 56320978 63790314 88.2908 +hybrid_mmi_sr_counts.txt 49709323 55387589 89.7481 +megahit_counts.txt 43437290 51826913 83.8122 +megahit_metaT_sr_counts.txt 19650549 78531498 25.0225 diff --git a/figures/data/megahit_HQ_bins.txt b/figures/data/megahit_HQ_bins.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f60ed6f510c51ae03f68d2960898b54b75f1c34 --- /dev/null +++ b/figures/data/megahit_HQ_bins.txt @@ -0,0 +1,13 @@ +megahit.metabat.12 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__Eubacterium_R;s__Eubacterium_R 98.66 1.68 0.00 +megahit.metabat.38 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Ruminococcus_D;s__Ruminococcus_D 97.26 0.68 100.00 +megahit.metabat.31 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Acetatifactor;s__Acetatifactor 98.87 1.81 28.57 +megahit.metabat.18 d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Acidaminococcales;f__Acidaminococcaceae;g__Phascolarctobacterium;s__Phascolarctobacterium 98.18 1.70 0.00 +maxbin_output.002 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides 83.95 2.11 20.00 +megahit.metabat.5 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Tannerellaceae;g__Parabacteroides;s__Parabacteroides 91.15 1.09 75.00 +maxbin_output.027 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Barnesiellaceae;g__Barnesiella;s__Barnesiella 92.10 1.57 42.86 +megahit.metabat.36 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Coprobacteraceae;g__Coprobacter;s__Coprobacter 98.11 0.77 0.00 +megahit.metabat.13_sub d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas 80.54 2.96 14.29 +maxbin_output.009 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 96.10 4.90 52.94 +maxbin_output.023 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 92.77 6.96 42.86 +megahit.metabat.43 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Tidjanibacter;s__Tidjanibacter 99.52 0.24 100.00 +megahit.metabat.45 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__;s__ 93.51 0.67 0.00 diff --git a/figures/data/mmi_lr_metaspades_hybrid_HQ_bins.txt b/figures/data/mmi_lr_metaspades_hybrid_HQ_bins.txt new file mode 100644 index 0000000000000000000000000000000000000000..68dd7152f35ea2073ad072197a2c2dcfd9d58f26 --- /dev/null +++ b/figures/data/mmi_lr_metaspades_hybrid_HQ_bins.txt @@ -0,0 +1,12 @@ +maxbin_output.025 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__Eubacterium_R;s__Eubacterium_R 97.32 1.81 0.00 +maxbin_output.022 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Ruminococcus_D;s__Ruminococcus_D 98.63 0.00 0.00 +maxbin_output.016 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Acetatifactor;s__Acetatifactor 97.58 4.35 18.18 +maxbin_output.021 d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Acidaminococcales;f__Acidaminococcaceae;g__Phascolarctobacterium;s__Phascolarctobacterium 99.98 2.89 0.00 +maxbin_output.001 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides 99.23 0.50 0.00 +maxbin_output.023 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Barnesiellaceae;g__Barnesiella;s__Barnesiella 95.85 0.38 0.00 +maxbin_output.028 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas 94.98 2.84 28.57 +maxbin_output.033 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Odoribacter;s__Odoribacter 88.78 2.15 0.00 +maxbin_output.007 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 99.52 0.48 0.00 +maxbin_output.020 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 98.53 2.88 33.33 +maxbin_output.012 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Tidjanibacter;s__Tidjanibacter 99.52 1.44 0.00 +maxbin_output.024 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__;s__ 95.97 0.67 0.00 diff --git a/figures/data/mmi_merged_metaspades_hybrid_HQ_bins.txt b/figures/data/mmi_merged_metaspades_hybrid_HQ_bins.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7dff0584af12f8316176438fadb46ef6ccfcf9b --- /dev/null +++ b/figures/data/mmi_merged_metaspades_hybrid_HQ_bins.txt @@ -0,0 +1,26 @@ +mmi_merged_metaspades_hybrid.metabat.59 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__Eubacterium_R;s__Eubacterium_R 97.32 0.00 0.00 +maxbin_output.017 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Ruminococcus_D;s__Ruminococcus_D 98.63 0.00 0.00 +mmi_merged_metaspades_hybrid.metabat.69 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium;s__Faecalibacterium 94.29 0.00 0.00 +mmi_merged_metaspades_hybrid.metabat.49 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Oscillospiraceae;g__Oscillibacter;s__Oscillibacter 87.58 1.85 75.00 +mmi_merged_metaspades_hybrid.metabat.20 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__TANB77;f__CAG-508;g__CAG-492;s__CAG-492 70.94 5.24 8.33 +mmi_merged_metaspades_hybrid.metabat.73 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Acetatifactor;s__Acetatifactor 98.31 1.93 16.67 +mmi_merged_metaspades_hybrid.metabat.28 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Faecalicatena;s__Faecalicatena 90.88 2.07 37.50 +mmi_merged_metaspades_hybrid.metabat.47 d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Acidaminococcales;f__Acidaminococcaceae;g__Succiniclasticum;s__Succiniclasticum 78.13 0.00 0.00 +maxbin_output.021 d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Acidaminococcales;f__Acidaminococcaceae;g__Phascolarctobacterium;s__Phascolarctobacterium 100.00 3.19 0.00 +mmi_merged_metaspades_hybrid.metabat.17 d__Bacteria;p__Cyanobacteria;c__Vampirovibrionia;o__Gastranaerophilales;f__Gastranaerophilaceae;g__CAG-196;s__CAG-196 76.49 0.38 0.00 +mmi_merged_metaspades_hybrid.metabat.65 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__UBA6398;s__UBA6398 87.30 1.28 33.33 +mmi_merged_metaspades_hybrid.metabat.19 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides_A;s__Bacteroides_A 86.53 0.56 50.00 +mmi_merged_metaspades_hybrid.metabat.82 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides_B;s__Bacteroides_B 93.32 8.74 20.00 +mmi_merged_metaspades_hybrid.metabat.46 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Barnesiellaceae;g__Barnesiella;s__Barnesiella 95.85 0.00 0.00 +mmi_merged_metaspades_hybrid.metabat.5 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Coprobacteraceae;g__Coprobacter;s__Coprobacter 91.32 0.57 0.00 +maxbin_output.029 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas 92.29 0.69 66.67 +mmi_merged_metaspades_hybrid.metabat.72_sub d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Odoribacter;s__Odoribacter 88.78 3.23 0.00 +mmi_merged_metaspades_hybrid.metabat.40 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 83.17 0.00 0.00 +maxbin_output.025_sub d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 92.51 7.10 26.92 +mmi_merged_metaspades_hybrid.metabat.48_sub d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 98.04 1.44 66.67 +maxbin_output.012 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Tidjanibacter;s__Tidjanibacter 99.52 1.44 0.00 +mmi_merged_metaspades_hybrid.metabat.64 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes_A;s__Alistipes_A 80.91 1.12 0.00 +mmi_merged_metaspades_hybrid.metabat.23 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__Parasutterella;s__Parasutterella 81.43 1.25 40.00 +maxbin_output.023 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__;s__ 95.97 0.67 0.00 +maxbin_output.035_sub d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Oscillospiraceae;g__CAG-170;s__ 85.61 5.61 21.43 +mmi_merged_metaspades_hybrid.metabat.56 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Oscillospiraceae;g__CAG-103;s__ 85.42 1.40 33.33 diff --git a/figures/data/mmi_sr_metaspades_hybrid_HQ_bins.txt b/figures/data/mmi_sr_metaspades_hybrid_HQ_bins.txt new file mode 100644 index 0000000000000000000000000000000000000000..996a53a4b5aa19f9f7639dc6891e62a56e5edcb3 --- /dev/null +++ b/figures/data/mmi_sr_metaspades_hybrid_HQ_bins.txt @@ -0,0 +1,27 @@ +mmi_sr_metaspades_hybrid.metabat.61 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__Eubacterium_R;s__Eubacterium_R 97.32 0.00 0.00 +maxbin_output.016 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Ruminococcus_D;s__Ruminococcus_D 98.63 0.00 0.00 +mmi_sr_metaspades_hybrid.metabat.72 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium;s__Faecalibacterium 94.29 0.00 0.00 +mmi_sr_metaspades_hybrid.metabat.50 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Oscillospiraceae;g__Oscillibacter;s__Oscillibacter 87.58 1.85 75.00 +mmi_sr_metaspades_hybrid.metabat.82 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__TANB77;f__CAG-508;g__CAG-492;s__CAG-492 71.52 7.11 6.25 +mmi_sr_metaspades_hybrid.metabat.57 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Acetatifactor;s__Acetatifactor 98.31 1.45 0.00 +mmi_sr_metaspades_hybrid.metabat.10 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Faecalicatena;s__Faecalicatena 90.88 1.68 50.00 +mmi_sr_metaspades_hybrid.metabat.48_sub d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Acidaminococcales;f__Acidaminococcaceae;g__Succiniclasticum;s__Succiniclasticum 77.53 0.00 0.00 +maxbin_output.022 d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Acidaminococcales;f__Acidaminococcaceae;g__Phascolarctobacterium;s__Phascolarctobacterium 99.98 2.89 0.00 +mmi_sr_metaspades_hybrid.metabat.45 d__Bacteria;p__Cyanobacteria;c__Vampirovibrionia;o__Gastranaerophilales;f__Gastranaerophilaceae;g__CAG-196;s__CAG-196 77.34 0.81 0.00 +mmi_sr_metaspades_hybrid.metabat.67 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__UBA6398;s__UBA6398 87.30 1.28 33.33 +maxbin_output.001 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides 99.23 0.50 0.00 +mmi_sr_metaspades_hybrid.metabat.41 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides_A;s__Bacteroides_A 86.53 0.56 50.00 +mmi_sr_metaspades_hybrid.metabat.80 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides_B;s__Bacteroides_B 93.32 9.11 21.62 +maxbin_output.019_sub d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Tannerellaceae;g__Parabacteroides;s__Parabacteroides 92.63 4.46 22.22 +maxbin_output.025 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Barnesiellaceae;g__Barnesiella;s__Barnesiella 99.25 0.38 0.00 +mmi_sr_metaspades_hybrid.metabat.5 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Coprobacteraceae;g__Coprobacter;s__Coprobacter 91.32 0.57 0.00 +mmi_sr_metaspades_hybrid.metabat.29 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas 98.03 1.41 40.00 +mmi_sr_metaspades_hybrid.metabat.71 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Odoribacter;s__Odoribacter 90.39 2.69 0.00 +maxbin_output.006 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 99.52 0.48 0.00 +mmi_sr_metaspades_hybrid.metabat.49_sub d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes;s__Alistipes 98.04 1.44 66.67 +mmi_sr_metaspades_hybrid.metabat.12 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Tidjanibacter;s__Tidjanibacter 83.65 0.00 0.00 +mmi_sr_metaspades_hybrid.metabat.1 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes_A;s__Alistipes_A 80.91 1.12 0.00 +mmi_sr_metaspades_hybrid.metabat.26 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__Parasutterella;s__Parasutterella 81.43 1.25 40.00 +mmi_sr_metaspades_hybrid.metabat.9 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Acutalibacteraceae;g__;s__ 93.96 0.67 0.00 +mmi_sr_metaspades_hybrid.metabat.47 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Oscillospiraceae;g__CAG-170;s__ 89.62 4.36 25.00 +mmi_sr_metaspades_hybrid.metabat.56 d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Oscillospiraceae;g__CAG-103;s__ 85.51 0.75 66.67 diff --git a/figures/data/overlap_sizes.txt b/figures/data/overlap_sizes.txt new file mode 100644 index 0000000000000000000000000000000000000000..d86ff8f9080a3c3e3be2de21895a35830f0f585c --- /dev/null +++ b/figures/data/overlap_sizes.txt @@ -0,0 +1,11 @@ +165287 flye +250742 megahit +365813 metaspades +341322 metaspades_hybrid +109382 flye_megahit +112740 flye_metaspades_hybrid +110600 flye_metaspades +196581 megahit_metaspades_hybrid +214396 megahit_metaspades +326458 metaspades_metaspades_hybrid +101535 flye_megahit_metaspades_hybrid_metaspades diff --git a/figures/data/total.txt b/figures/data/total.txt new file mode 100644 index 0000000000000000000000000000000000000000..db35f6a2481179906c3034dff9cbaaae259e0aad --- /dev/null +++ b/figures/data/total.txt @@ -0,0 +1,4 @@ +flye 165287 +megahit 250742 +metaspades 365813 +metaspades_hybrid 341322 diff --git a/figures/envs/figures.yml b/figures/envs/figures.yml new file mode 100644 index 0000000000000000000000000000000000000000..49f4d878a5b62a68c050f498ee79c2022c67c107 --- /dev/null +++ b/figures/envs/figures.yml @@ -0,0 +1,143 @@ +name: ONT_pilot_fig +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_llvm + - aioeasywebdav=2.4.0=py36_1000 + - aiohttp=3.6.2=py36h516909a_0 + - appdirs=1.4.3=py_1 + - async-timeout=3.0.1=py_1000 + - attrs=19.3.0=py_0 + - bcrypt=3.1.7=py36h8c4c3a4_1 + - boto3=1.13.18=pyh9f0ad1d_0 + - botocore=1.16.18=pyh9f0ad1d_0 + - brotlipy=0.7.0=py36h8c4c3a4_1000 + - ca-certificates=2020.4.5.1=hecc5488_0 + - cachetools=4.1.0=py_1 + - cairo=1.16.0=h3fc0475_1004 + - certifi=2020.4.5.1=py36h9f0ad1d_0 + - cffi=1.14.0=py36hd463f26_0 + - chardet=3.0.4=py36h9f0ad1d_1006 + - configargparse=1.2.3=pyh9f0ad1d_0 + - cryptography=2.9.2=py36h45558ae_0 + - datrie=0.8.2=py36h8c4c3a4_0 + - decorator=4.4.2=py_0 + - docutils=0.15.2=py36_0 + - dropbox=9.4.0=py_0 + - expat=2.2.9=he1b5a44_2 + - filechunkio=1.8=py_2 + - fontconfig=2.13.1=h1056068_1002 + - freetype=2.10.2=he06d7ca_0 + - ftputil=3.4=py_0 + - gettext=0.19.8.1=hc5be6a0_1002 + - gitdb=4.0.5=py_0 + - gitpython=3.1.2=py_0 + - glib=2.64.3=h6f030ca_0 + - google-api-core=1.17.0=py36h9f0ad1d_0 + - google-auth=1.14.3=pyh9f0ad1d_0 + - google-cloud-core=1.3.0=py_0 + - google-cloud-storage=1.28.1=pyh9f0ad1d_0 + - google-resumable-media=0.5.0=py_1 + - googleapis-common-protos=1.51.0=py36h9f0ad1d_2 + - graphite2=1.3.13=he1b5a44_1001 + - graphviz=2.38.0=hf68f40c_1011 + - harfbuzz=2.4.0=hee91db6_4 + - icu=67.1=he1b5a44_0 + - idna=2.9=py_1 + - idna_ssl=1.1.0=py36_1000 + - importlib-metadata=1.6.0=py36h9f0ad1d_0 + - importlib_metadata=1.6.0=0 + - jinja2=2.11.2=pyh9f0ad1d_0 + - jmespath=0.10.0=pyh9f0ad1d_0 + - jpeg=9c=h14c3975_1001 + - jsonschema=3.2.0=py36h9f0ad1d_1 + - ld_impl_linux-64=2.34=h53a641e_4 + - libblas=3.8.0=16_openblas + - libcblas=3.8.0=16_openblas + - libffi=3.2.1=he1b5a44_1007 + - libgcc-ng=9.2.0=h24d8f2e_2 + - libgfortran-ng=7.5.0=hdf63c60_6 + - libiconv=1.15=h516909a_1006 + - liblapack=3.8.0=16_openblas + - libopenblas=0.3.9=h5ec1e0e_0 + - libpng=1.6.37=hed695b0_1 + - libprotobuf=3.12.1=h8b12597_0 + - libstdcxx-ng=9.2.0=hdf63c60_2 + - libtiff=4.1.0=hc7e4089_6 + - libtool=2.4.6=h14c3975_1002 + - libuuid=2.32.1=h14c3975_1000 + - libwebp-base=1.1.0=h516909a_3 + - libxcb=1.13=h14c3975_1002 + - libxml2=2.9.10=h72b56ed_1 + - llvm-openmp=10.0.0=hc9558a2_0 + - lz4-c=1.9.2=he1b5a44_1 + - markupsafe=1.1.1=py36h8c4c3a4_1 + - multidict=4.7.5=py36h8c4c3a4_1 + - ncurses=6.1=hf484d3e_1002 + - networkx=2.4=py_1 + - numpy=1.18.4=py36h7314795_0 + - openssl=1.1.1g=h516909a_0 + - pandas=1.0.3=py36h830a2c2_1 + - pango=1.40.14=he7ab937_1005 + - paramiko=2.7.1=py36_0 + - pcre=8.44=he1b5a44_0 + - pip=20.1.1=py_1 + - pixman=0.38.0=h516909a_1003 + - prettytable=0.7.2=py_3 + - protobuf=3.12.1=py36h831f99a_0 + - psutil=5.7.0=py36h8c4c3a4_1 + - pthread-stubs=0.4=h14c3975_1001 + - pyasn1=0.4.8=py_0 + - pyasn1-modules=0.2.7=py_0 + - pycparser=2.20=py_0 + - pygraphviz=1.5=py36h8c4c3a4_1002 + - pynacl=1.3.0=py36h516909a_1001 + - pyopenssl=19.1.0=py_1 + - pyrsistent=0.16.0=py36h8c4c3a4_0 + - pysftp=0.2.9=py_1 + - pysocks=1.7.1=py36h9f0ad1d_1 + - python=3.6.10=h8356626_1011_cpython + - python-dateutil=2.8.1=py_0 + - python-irodsclient=0.8.2=py_0 + - python_abi=3.6=1_cp36m + - pytz=2020.1=pyh9f0ad1d_0 + - pyyaml=5.3.1=py36h8c4c3a4_0 + - ratelimiter=1.2.0=py36_1000 + - readline=8.0=hf8c457e_0 + - requests=2.23.0=pyh8c360ce_2 + - rsa=4.0=py_0 + - s3transfer=0.3.3=py36h9f0ad1d_1 + - setuptools=46.4.0=py36h9f0ad1d_0 + - six=1.15.0=pyh9f0ad1d_0 + - smmap=3.0.4=pyh9f0ad1d_0 + - snakemake=5.3.0=py36_1 + - snakemake-minimal=5.3.0=py36_1 + - sqlite=3.30.1=hcee41ef_0 + - tk=8.6.10=hed695b0_0 + - typing_extensions=3.7.4.2=py_0 + - urllib3=1.25.9=py_0 + - wheel=0.34.2=py_1 + - wrapt=1.12.1=py36h8c4c3a4_1 + - xmlrunner=1.7.7=py_0 + - xorg-kbproto=1.0.7=h14c3975_1002 + - xorg-libice=1.0.10=h516909a_0 + - xorg-libsm=1.2.3=h84519dc_1000 + - xorg-libx11=1.6.9=h516909a_0 + - xorg-libxau=1.0.9=h14c3975_0 + - xorg-libxdmcp=1.1.3=h516909a_0 + - xorg-libxext=1.3.4=h516909a_0 + - xorg-libxpm=3.5.13=h516909a_0 + - xorg-libxrender=0.9.10=h516909a_1002 + - xorg-libxt=1.1.5=h516909a_1003 + - xorg-renderproto=0.11.1=h14c3975_1002 + - xorg-xextproto=7.3.0=h14c3975_1002 + - xorg-xproto=7.0.31=h14c3975_1007 + - xz=5.2.5=h516909a_0 + - yaml=0.2.4=h516909a_0 + - yarl=1.3.0=py36h516909a_1000 + - zipp=3.1.0=py_0 + - zlib=1.2.11=h516909a_1006 + - zstd=1.4.4=h6597ccf_3 diff --git a/figures/envs/r.yml b/figures/envs/r.yml new file mode 100644 index 0000000000000000000000000000000000000000..6c984b52da8329cb129f9a74ba44b815d3456cb9 --- /dev/null +++ b/figures/envs/r.yml @@ -0,0 +1,196 @@ +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_llvm + - _r-mutex=1.0.1=anacondar_1 + - binutils_impl_linux-64=2.34=h53a641e_4 + - binutils_linux-64=2.34=hc952b39_20 + - bwidget=1.9.14=0 + - bzip2=1.0.8=h516909a_2 + - ca-certificates=2020.4.5.1=hecc5488_0 + - cairo=1.16.0=hcf35c78_1003 + - certifi=2020.4.5.1=py38h32f6830_0 + - curl=7.69.1=h33f0ec9_0 + - fontconfig=2.13.1=h86ecdb6_1001 + - freetype=2.10.2=he06d7ca_0 + - fribidi=1.0.9=h516909a_0 + - gcc_impl_linux-64=7.5.0=hd420e75_6 + - gcc_linux-64=7.5.0=h09487f9_20 + - gettext=0.19.8.1=hc5be6a0_1002 + - gfortran_impl_linux-64=7.5.0=hdf63c60_6 + - gfortran_linux-64=7.5.0=h09487f9_20 + - glib=2.64.3=h6f030ca_0 + - graphite2=1.3.13=he1b5a44_1001 + - gsl=2.6=h294904e_0 + - gxx_impl_linux-64=7.5.0=hdf63c60_6 + - gxx_linux-64=7.5.0=h09487f9_20 + - harfbuzz=2.4.0=h9f30f68_3 + - icu=64.2=he1b5a44_1 + - jpeg=9c=h14c3975_1001 + - krb5=1.17.1=h2fd8d38_0 + - ld_impl_linux-64=2.34=h53a641e_4 + - libblas=3.8.0=16_openblas + - libcblas=3.8.0=16_openblas + - libcurl=7.69.1=hf7181ac_0 + - libedit=3.1.20170329=hf8c457e_1001 + - libffi=3.2.1=he1b5a44_1007 + - libgcc-ng=9.2.0=h24d8f2e_2 + - libgfortran-ng=7.5.0=hdf63c60_6 + - libgomp=9.2.0=h24d8f2e_2 + - libiconv=1.15=h516909a_1006 + - liblapack=3.8.0=16_openblas + - libopenblas=0.3.9=h5ec1e0e_0 + - libpng=1.6.37=hed695b0_1 + - libssh2=1.9.0=hab1572f_2 + - libstdcxx-ng=9.2.0=hdf63c60_2 + - libtiff=4.1.0=hc7e4089_6 + - libuuid=2.32.1=h14c3975_1000 + - libwebp-base=1.1.0=h516909a_3 + - libxcb=1.13=h14c3975_1002 + - libxml2=2.9.10=hee79883_0 + - llvm-openmp=10.0.0=hc9558a2_0 + - lz4-c=1.9.2=he1b5a44_1 + - make=4.3=h516909a_0 + - ncurses=6.1=hf484d3e_1002 + - openssl=1.1.1g=h516909a_0 + - pango=1.42.4=h7062337_4 + - pcre=8.44=he1b5a44_0 + - pcre2=10.34=h2f06484_0 + - pip=20.1.1=py_1 + - pixman=0.38.0=h516909a_1003 + - pthread-stubs=0.4=h14c3975_1001 + - python=3.8.2=he5300dc_7_cpython + - python_abi=3.8=1_cp38 + - r-argparse=2.0.1=r40h6115d3f_2 + - r-askpass=1.1=r40hcdcec82_2 + - r-assertthat=0.2.1=r40h6115d3f_2 + - r-backports=1.1.7=r40hcdcec82_0 + - r-base=4.0.0=hdca8982_3 + - r-brew=1.0_6=r40h6115d3f_1003 + - r-callr=3.4.3=r40h6115d3f_1 + - r-cli=2.0.2=r40h6115d3f_1 + - r-clipr=0.7.0=r40h6115d3f_1 + - r-colorspace=1.4_1=r40hcdcec82_2 + - r-commonmark=1.7=r40hcdcec82_1002 + - r-covr=3.5.0=r40h0357c0b_1 + - r-crayon=1.3.4=r40h6115d3f_1003 + - r-crosstalk=1.1.0.1=r40h6115d3f_1 + - r-curl=4.3=r40hcdcec82_1 + - r-desc=1.2.0=r40h6115d3f_1003 + - r-devtools=2.3.0=r40h6115d3f_1 + - r-digest=0.6.25=r40h0357c0b_2 + - r-dplyr=0.8.5=r40h0357c0b_1 + - r-dt=0.13=r40h6115d3f_1 + - r-ellipsis=0.3.1=r40hcdcec82_0 + - r-evaluate=0.14=r40h6115d3f_2 + - r-fansi=0.4.1=r40hcdcec82_1 + - r-farver=2.0.3=r40h0357c0b_1 + - r-findpython=1.0.5=r40h6115d3f_2 + - r-fs=1.4.1=r40h0357c0b_1 + - r-getopt=1.20.3=r40_2 + - r-ggfortify=0.4.10=r40h6115d3f_1 + - r-ggplot2=3.3.0=r40h6115d3f_1 + - r-ggrepel=0.8.2=r40h0357c0b_1 + - r-ggsci=2.9=r40h6115d3f_1003 + - r-ggsignif=0.6.0=r40h6115d3f_1 + - r-gh=1.1.0=r40h6115d3f_1 + - r-git2r=0.27.1=r40h7253d3a_0 + - r-glue=1.4.1=r40hcdcec82_0 + - r-gridextra=2.3=r40h6115d3f_1003 + - r-gtable=0.3.0=r40h6115d3f_3 + - r-highr=0.8=r40h6115d3f_2 + - r-htmltools=0.4.0=r40h0357c0b_1 + - r-htmlwidgets=1.5.1=r40h6115d3f_1 + - r-httr=1.4.1=r40h6115d3f_2 + - r-ini=0.3.1=r40h6115d3f_1003 + - r-isoband=0.2.1=r40h0357c0b_1 + - r-jsonlite=1.6.1=r40hcdcec82_1 + - r-knitr=1.28=r40h6115d3f_1 + - r-labeling=0.3=r40h6115d3f_1003 + - r-later=1.0.0=r40h0357c0b_1 + - r-lattice=0.20_41=r40hcdcec82_2 + - r-lazyeval=0.2.2=r40hcdcec82_2 + - r-lifecycle=0.2.0=r40h6115d3f_1 + - r-magrittr=1.5=r40h6115d3f_1003 + - r-markdown=1.1=r40hcdcec82_1 + - r-mass=7.3_51.6=r40hcdcec82_2 + - r-matrix=1.2_18=r40h7fa42b6_3 + - r-memoise=1.1.0=r40h6115d3f_1004 + - r-mgcv=1.8_31=r40h7fa42b6_1 + - r-mime=0.9=r40hcdcec82_1 + - r-munsell=0.5.0=r40h6115d3f_1003 + - r-nlme=3.1_148=r40h9bbef5b_0 + - r-openssl=1.4.1=r40he5c4762_1 + - r-pheatmap=1.0.12=r40h6115d3f_2 + - r-pillar=1.4.4=r40h6115d3f_0 + - r-pkgbuild=1.0.8=r40h6115d3f_0 + - r-pkgconfig=2.0.3=r40h6115d3f_1 + - r-pkgload=1.0.2=r40h0357c0b_1002 + - r-plogr=0.2.0=r40h6115d3f_1003 + - r-plyr=1.8.6=r40h0357c0b_1 + - r-praise=1.0.0=r40h6115d3f_1004 + - r-prettyunits=1.1.1=r40h6115d3f_1 + - r-processx=3.4.2=r40hcdcec82_1 + - r-promises=1.1.0=r40h0357c0b_1 + - r-ps=1.3.3=r40hcdcec82_0 + - r-purrr=0.3.4=r40hcdcec82_1 + - r-r6=2.4.1=r40h6115d3f_1 + - r-rcmdcheck=1.3.3=r40h6115d3f_3 + - r-rcolorbrewer=1.1_2=r40h6115d3f_1003 + - r-rcpp=1.0.4.6=r40h0357c0b_1 + - r-rematch2=2.1.2=r40h6115d3f_1 + - r-remotes=2.1.1=r40h6115d3f_1 + - r-reshape2=1.4.4=r40h0357c0b_1 + - r-rex=1.2.0=r40h6115d3f_1 + - r-rlang=0.4.6=r40hcdcec82_0 + - r-roxygen2=7.1.0=r40h0357c0b_1 + - r-rprojroot=1.3_2=r40h6115d3f_1003 + - r-rstudioapi=0.11=r40h6115d3f_1 + - r-rversions=2.0.2=r40h6115d3f_0 + - r-scales=1.1.1=r40h6115d3f_0 + - r-sessioninfo=1.1.1=r40h6115d3f_1002 + - r-stringi=1.4.6=r40h0e574ca_2 + - r-stringr=1.4.0=r40h6115d3f_2 + - r-sys=3.3=r40hcdcec82_1 + - r-testit=0.11=r40h6115d3f_1 + - r-testthat=2.3.2=r40h0357c0b_1 + - r-tibble=3.0.1=r40hcdcec82_1 + - r-tidyr=1.1.0=r40h0357c0b_0 + - r-tidyselect=1.1.0=r40h6115d3f_0 + - r-upsetr=1.4.0=r40h6115d3f_2 + - r-usethis=1.6.1=r40h6115d3f_1 + - r-utf8=1.1.4=r40hcdcec82_1003 + - r-vctrs=0.3.0=r40hcdcec82_0 + - r-viridis=0.5.1=r40h6115d3f_1004 + - r-viridislite=0.3.0=r40h6115d3f_1003 + - r-whisker=0.4=r40h6115d3f_1 + - r-withr=2.2.0=r40h6115d3f_1 + - r-xfun=0.14=r40h6115d3f_0 + - r-xml2=1.3.2=r40h0357c0b_1 + - r-xopen=1.0.0=r40h6115d3f_1003 + - r-yaml=2.2.1=r40hcdcec82_1 + - r-zeallot=0.1.0=r40h6115d3f_1002 + - readline=8.0=hf8c457e_0 + - sed=4.7=h1bed415_1000 + - setuptools=46.4.0=py38h32f6830_0 + - sqlite=3.30.1=hcee41ef_0 + - sysroot_linux-64=2.12=h3a0023d_3 + - tk=8.6.10=hed695b0_0 + - tktable=2.10=h555a92e_3 + - wheel=0.34.2=py_1 + - xorg-kbproto=1.0.7=h14c3975_1002 + - xorg-libice=1.0.10=h516909a_0 + - xorg-libsm=1.2.3=h84519dc_1000 + - xorg-libx11=1.6.9=h516909a_0 + - xorg-libxau=1.0.9=h14c3975_0 + - xorg-libxdmcp=1.1.3=h516909a_0 + - xorg-libxext=1.3.4=h516909a_0 + - xorg-libxrender=0.9.10=h516909a_1002 + - xorg-renderproto=0.11.1=h14c3975_1002 + - xorg-xextproto=7.3.0=h14c3975_1002 + - xorg-xproto=7.0.31=h14c3975_1007 + - xz=5.2.5=h516909a_0 + - zlib=1.2.11=h516909a_1006 + - zstd=1.4.4=h6597ccf_3 diff --git a/figures/figures.smk b/figures/figures.smk new file mode 100644 index 0000000000000000000000000000000000000000..9440dcd3839ab996308a56d793d582277ee60d62 --- /dev/null +++ b/figures/figures.smk @@ -0,0 +1,354 @@ +################################################## +# Config +################################################## +configfile: "figures.yml" + +################################################## +# Python modules +################################################## +import os + +################################################## +# TARGETS +################################################## +FIG_MMSEQ_UPSETR = os.path.join(config["output"], config["fig_mmseq_upsetr"]["output"]) +FIG_QUAST = os.path.join(config["output"], config["fig_quast"]["output"]) +FIG_PARTIAL_GENES = os.path.join(config["output"], config["fig_partial_genes"]["output"]) +FIG_NANOSTATS = os.path.join(config["output"], config["fig_nanostats"]["output"]) +FIG_CRISPR = os.path.join(config["output"], config["fig_crispr"]["output"]) +FIG_PLASFLOW = os.path.join(config["output"], config["fig_plasflow"]["output"]) +FIG_RGI = os.path.join(config["output"], config["fig_rgi"]["output"]) + +################################################## +# RULES +################################################## +rule all: + input: + FIG_MMSEQ_UPSETR, + FIG_QUAST, + FIG_PARTIAL_GENES, + FIG_NANOSTATS, + FIG_CRISPR, + FIG_PLASFLOW, + FIG_RGI, + +rule fig_mmseq_upsetr: + input: + overlap_sizes=config["fig_mmseq_upsetr"]["input"]["overlap_sizes"] + output: + pdf=FIG_MMSEQ_UPSETR + log: + FIG_MMSEQ_UPSETR + ".log" + params: + utils=config["utils"], + width=config["fig_mmseq_upsetr"]["width"], + height=config["fig_mmseq_upsetr"]["height"] + conda: + "envs/r.yml" + script: + config["fig_mmseq_upsetr"]["script"] + +rule fig_quast: + input: + stats=config["fig_quast"]["input"]["stats"] + output: + pdf=FIG_QUAST + log: + FIG_QUAST + ".log" + params: + utils=config["utils"], + width=config["fig_quast"]["width"], + height=config["fig_quast"]["height"] + conda: + "envs/r.yml" + script: + config["fig_quast"]["script"] + +rule fig_partial_genes: + input: + counts=config["fig_partial_genes"]["input"]["counts"] + output: + pdf=FIG_PARTIAL_GENES + log: + FIG_PARTIAL_GENES + ".log" + params: + utils=config["utils"], + width=config["fig_partial_genes"]["width"], + height=config["fig_partial_genes"]["height"] + conda: + "envs/r.yml" + script: + config["fig_partial_genes"]["script"] + +rule fig_nanostats_data: + input: + expand( + os.path.join(config["data_path"], "{meth_flag}/qc/lr/{sel_flag}/no_barcode/no_barcodeNanoStats.txt"), + meth_flag=["results", "non_methylation_aware_results"], + sel_flag=["S1_SizeSelected", "S3_Gtube", "merged"] + ) + output: + config["fig_nanostats"]["input"]["stats"] + run: + import re + import pandas + summary = dict() + for ifile_path in input: + summary[ifile_path] = dict() + # info from file path + summary[ifile_path]["meth_flag"] = "M" if not re.search("non_methylation_aware", ifile_path) else "N" + sel_flag = None + if re.search("S1_SizeSelected", ifile_path): + sel_flag = "size_selected" + elif re.search("S3_Gtube", ifile_path): + sel_flag = "gtube" + elif re.search("merged", ifile_path): + sel_flag = "merged" + assert sel_flag is not None + summary[ifile_path]["sel_flag"] = sel_flag + # file content + flag = None + with open(ifile_path, "r") as ifile: + for line in ifile: + if re.match("General summary", line): + flag = "general" + elif re.match("Number, percentage and megabases of reads above quality cutoffs", line): + flag = "number" + elif re.match("Top 5 highest mean basecall quality scores and their read lengths", line): + flag = "top5QS" + elif re.match("Top 5 longest reads and their mean basecall quality score", line): + flag = "top5LR" + else: + # general summary + if flag == "general": + line = re.sub(":\s+", "\t", line) + k, v = line.split("\t") + summary[ifile_path][k] = float(re.sub(",", "", v)) + # rest: skip + else: + continue + # dict to DataFrame + summary = pandas.DataFrame.from_dict(summary, orient="index") + # wrote to file + summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False) + +rule fig_nanostats: + input: + stats=config["fig_nanostats"]["input"]["stats"] + output: + pdf=FIG_NANOSTATS + log: + FIG_NANOSTATS + ".log" + params: + utils=config["utils"], + width=config["fig_nanostats"]["width"], + height=config["fig_nanostats"]["height"] + conda: + "envs/r.yml" + script: + config["fig_nanostats"]["script"] + +rule fig_crispr_data: + input: + expand( + os.path.join(config["data_path"], "results/analysis/crispr/minced/{asm_tool}.txt"), + asm_tool=config["assemblers"] + ), + expand( + os.path.join(config["data_path"], "results/analysis/crispr/casc/{asm_tool}_casc_output/{asm_tool}.results.txt"), + asm_tool=config["assemblers"] + ) + output: + config["fig_crispr"]["input"]["stats"] + run: + import os + import re + import pandas + from src.utils import parse_minced_report, parse_casc_report + summary = [] + for ifile_path in input: + ifile_df = None + # info from file path + crispr_tool = None + if re.search("minced", ifile_path): + crispr_tool = "minced" + elif re.search("casc", ifile_path): + crispr_tool = "casc" + asm_tool = os.path.basename(ifile_path).split(".")[0] + # file content + if crispr_tool == "minced": + ifile_df = pandas.DataFrame(parse_minced_report(ifile_path)) + elif crispr_tool == "casc": + ifile_df = parse_casc_report(ifile_path) + ifile_df = ifile_df.assign(crispr_tool=crispr_tool) + ifile_df = ifile_df.assign(asm_tool=asm_tool) + summary.append(ifile_df) + # concat + summary = pandas.concat(objs=summary, axis="index") + # wrote to file + summary.to_csv(output[0], sep="\t", header=True, index=False, index_label=False) + +rule fig_crispr: + input: + stats=config["fig_crispr"]["input"]["stats"] + output: + pdf=FIG_CRISPR + log: + FIG_CRISPR + ".log" + params: + utils=config["utils"], + width=config["fig_crispr"]["width"], + height=config["fig_crispr"]["height"] + conda: + "envs/r.yml" + script: + config["fig_crispr"]["script"] + +rule fig_plasflow_data: + input: + expand( + os.path.join(config["data_path"], "results/analysis/plasflow/{asm_tool}.tsv"), + asm_tool=config["assemblers"] + ) + output: + config["fig_plasflow"]["input"]["stats"] + run: + import pandas + + def sum_up(df, colname, label): + df_sum = df.groupby(colname)['contig_length'].agg(['sum','count']) + df_sum["count_pct"] = 100 * df_sum["count"] / sum(df_sum["count"]) + df_sum["sum_pct"] = 100 * df_sum["sum"] / sum(df_sum["sum"]) + df_sum["type"] = label + return df_sum + + # read in and summarize + summary = [] + for ifile in input: + idf = pandas.read_csv(ifile, sep="\t", header=0, usecols=["contig_length", "label"]) + # extract class label, i.e. w/o taxon + idf = idf.assign(classlabel=idf.label.apply(lambda x: x.split(".")[0])) + # sum up + idf_labelsum = sum_up(idf, "label", "label") + idf_classsum = sum_up(idf, "classlabel", "class") + idf_sum = pandas.concat(objs=[idf_labelsum, idf_classsum], axis="index") + idf_sum["tool"] = os.path.splitext(os.path.basename(ifile))[0] + summary.append(idf_sum) + # concat and save + summary = pandas.concat(objs=summary, axis="index") + summary.to_csv(output[0], sep="\t", header=True, index=True, index_label="label") + +rule fig_plasflow: + input: + stats=config["fig_plasflow"]["input"]["stats"] + output: + pdf=FIG_PLASFLOW + log: + FIG_PLASFLOW + ".log" + params: + utils=config["utils"], + width=config["fig_plasflow"]["width"], + height=config["fig_plasflow"]["height"] + conda: + "envs/r.yml" + script: + config["fig_plasflow"]["script"] + +rule fig_rgi_data: + input: + expand( + os.path.join(config["data_path"], "results/analysis/rgi/{asm_tool}.txt"), + asm_tool=config["assemblers"] + ) + output: + config["fig_rgi"]["input"]["stats"] + run: + import pandas + import numpy + + summary_cols = ["Best_Hit_ARO", "ARO", "Drug Class", "Resistance Mechanism", "AMR Gene Family"] + summary_types = ["all", "strict", "nudged"] + + def count_hits(df, colname, tool_label=None): + """ + Sum up RGI results: count hits by grouping them + """ + df_sum = df.groupby(colname)[colname].agg(["count"]) + df_sum["label"] = df_sum.index + if tool_label is not None: + df_sum.rename(columns={"count": tool_label}, inplace=True) + return df_sum + + def sumup_hits(df, summary_cols, tool_label): + """ + Count hits grouping them by each given column: all/strict/nudged + """ + assert set(df.Nudged) == {numpy.nan, True}, "Unexpected \"Nudged\" value in RGI report: {}".format(set(df.Nudged)) + summary = dict.fromkeys(summary_cols) + for summary_col in summary_cols: + summary[summary_col] = { + "all": count_hits(df, summary_col, tool_label), + "strict": count_hits(df[pandas.isnull(df.Nudged)], summary_col, tool_label), + "nudged": count_hits(df[pandas.notnull(df.Nudged)], summary_col, tool_label) + } + return summary + + def merge_summaries(dfs, summary_col, summary_type): + """ + Merge summaries (list of DataFrame obj.s) for given column and type + """ + summary = None + for df in dfs: + if summary is None: + summary = df[summary_col][summary_type].copy() + else: + summary = summary.merge( + right=df[summary_col][summary_type], + on="label", + how="outer", + sort=False + ) + summary["col"] = summary_col + summary["type"] = summary_type + return summary + + # read in and summarize + summaries = [] + for ifile in input: + # tool name from file name + ifile_tool = os.path.splitext(os.path.basename(ifile))[0] + # read in the report + summaries.append( + sumup_hits( + pandas.read_csv(ifile, sep="\t", header=0, usecols=["ORF_ID"] + summary_cols + ["Nudged"]), + summary_cols, + ifile_tool + ) + ) + # merge + merged_summaries = [] + for summary_col in summary_cols: + for summary_type in summary_types: + merged_summaries.append(merge_summaries(summaries, summary_col, summary_type)) + # concat + summary = pandas.concat(objs=merged_summaries, axis="index") + # reorder columns + summary_cols_ordered = ["label", "col", "type"] + [c for c in summary.columns if c not in ["label", "col", "type"]] + # save + summary[summary_cols_ordered].to_csv(output[0], sep="\t", header=True, index=False, index_label=False, na_rep=0) + +rule fig_rgi: + input: + stats=config["fig_rgi"]["input"]["stats"] + output: + pdf=FIG_RGI + log: + FIG_RGI + ".log" + params: + utils=config["utils"], + width=config["fig_rgi"]["width"], + height=config["fig_rgi"]["height"] + conda: + "envs/r.yml" + script: + config["fig_rgi"]["script"] diff --git a/figures/figures.yml b/figures/figures.yml new file mode 100644 index 0000000000000000000000000000000000000000..ee79a0879067981e8e1859c4c33919ae6be607f3 --- /dev/null +++ b/figures/figures.yml @@ -0,0 +1,62 @@ +# general +data_path: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/" +output: "output" # output directory +utils: "src/utils.R" # path to utils.R +assemblers: ["flye", "megahit", "metaspades", "metaspades_hybrid"] + +# figures +fig_mmseq_upsetr: + script: "src/fig_mmseq_upsetr.R" + input: + overlap_sizes: "data/overlap_sizes.txt" + output: "fig_mmseq_upsetr.pdf" + width: 7 + height: 5 + +fig_quast: + script: "src/fig_quast.R" + input: + stats: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/quast_methylation_v_non_comparison/quast_comparision_meth_v_non_results/report.tsv" + output: "fig_quast.pdf" + width: 7 + height: 10 + +fig_partial_genes: + script: "src/fig_partial_genes.R" + input: + counts: "data/2019_GDB_protein_partial_genes.txt" + output: "fig_partial_genes.pdf" + width: 7 + height: 5 + +fig_nanostats: + script: "src/fig_nanostats.R" + input: + stats: "data/nanostats_summary.tsv" + output: "fig_nanostats.pdf" + width: 7 + height: 9 + +fig_crispr: + script: "src/fig_crispr.R" + input: + stats: "data/crispr_summary.tsv" + output: "fig_crispr.pdf" + width: 7 + height: 5 + +fig_plasflow: + script: "src/fig_plasflow.R" + input: + stats: "data/plasflow_summary.tsv" + output: "fig_plasflow.pdf" + width: 7 + height: 5 + +fig_rgi: + script: "src/fig_rgi.R" + input: + stats: "data/rgi_summary.tsv" + output: "fig_rgi.pdf" + width: 10 + height: 7 \ No newline at end of file diff --git a/figures/src/PathoFact_AMR_Figures.R b/figures/src/PathoFact_AMR_Figures.R new file mode 100644 index 0000000000000000000000000000000000000000..56df94a5a0eacc69ca952bf27956714276e061fa --- /dev/null +++ b/figures/src/PathoFact_AMR_Figures.R @@ -0,0 +1,96 @@ +# To generate figures for PathoFact output +library(tidyverse) +library(ggplot2) +################# +# Preprocessing # +################# +ONT <- read.csv("~/Documents/R_data/nanopore_ONT/pathofact_AMR/PathoFact_ONT.csv", row.names=1) +# Toxin prediction +Toxin_confidence_level <- c("-","1","2") +Toxin_prediction <- c("non-pathogenic","secreted","non-secreted") +Toxin <- data.frame(Toxin_confidence_level,Toxin_prediction) +Toxin_ONT <- ONT %>% group_by(Toxin_confidence_level, Sample) %>% summarise(RNum_Gi=sum(RNum_Gi)) +Toxin_ONT <- left_join(Toxin_ONT, Toxin, by="Toxin_confidence_level") +# Virulence prediction +Virulence_confidence_level <- c("-","1","2","3","4") +virulence_prediction <- c("non-pathogenic","secreted","non-secreted","non-pathogenic","non-pathogenic") +Virulence <- data.frame(Virulence_confidence_level,virulence_prediction) +Virulence_ONT <- ONT %>% group_by(Virulence_confidence_level, Sample) %>% summarise(RNum_Gi=sum(RNum_Gi)) +Virulence_ONT <- left_join(Virulence_ONT, Virulence) +Virulence_ONT <- Virulence_ONT %>% group_by(virulence_prediction, Sample) %>% summarise(RNum_Gi=sum(RNum_Gi)) +# AMR prediction +AMR_ONT <- ONT %>% select(10,15,16) +AMR_ONT$AMR <- ifelse(AMR_ONT$AMR_Category == "-", "-", "AMR") +## Overall AMR +AMR_ONT_overall <- AMR_ONT %>% group_by(AMR, Sample) %>% summarise(RNum_Gi=sum(RNum_Gi)) +## AMR category +AMR_ONT_category <- AMR_ONT %>% group_by(AMR_Category, Sample) %>% summarise(RNum_Gi=sum(RNum_Gi)) + +ONT <- read.csv("~/Documents/R_data/nanopore_ONT/pathofact_AMR/PathoFact_ONT.csv", row.names=1) +# Toxin prediction +Toxin_confidence_level <- c("-","1","2") +Toxin_prediction <- c("non-pathogenic","secreted","non-secreted") +Toxin <- data.frame(Toxin_confidence_level,Toxin_prediction) +Toxin_ONT <- ONT %>% group_by(Toxin_confidence_level, Sample) %>% summarise(RNum_Gi=sum(RNum_Gi)) +Toxin_ONT <- left_join(Toxin_ONT, Toxin, by="Toxin_confidence_level") +# Virulence prediction +Virulence_confidence_level <- c("-","1","2","3","4") +virulence_prediction <- c("non-pathogenic","secreted","non-secreted","non-pathogenic","non-pathogenic") +Virulence <- data.frame(Virulence_confidence_level,virulence_prediction) +Virulence_ONT <- ONT %>% group_by(Virulence_confidence_level, Sample) %>% summarise(RNum_Gi=sum(RNum_Gi)) +Virulence_ONT <- left_join(Virulence_ONT, Virulence) +Virulence_ONT <- Virulence_ONT %>% group_by(virulence_prediction, Sample) %>% summarise(RNum_Gi=sum(RNum_Gi)) +# AMR prediction +AMR_ONT <- ONT %>% select(10,15,16) +AMR_ONT$AMR <- ifelse(AMR_ONT$AMR_Category == "-", "-", "AMR") +## Overall AMR +AMR_ONT_overall <- AMR_ONT %>% group_by(AMR, Sample) %>% summarise(RNum_Gi=sum(RNum_Gi)) +## AMR category +AMR_ONT_category <- AMR_ONT %>% group_by(AMR_Category, Sample) %>% summarise(RNum_Gi=sum(RNum_Gi)) + +########### +# Figures # +########### +#FIGURES +#set theme +theme_set(theme_bw() + + theme(strip.background = element_rect(colour = "black", fill = "white"), strip.text = element_text(size=14, family="Arial")) + + theme(axis.text.x = element_text(angle = 90, size = 12, family = "Arial", hjust = 0.95), + axis.text.y = element_text(size = 14, family = "Arial"), + axis.title.y = element_text(size = 12, family = "Arial"), + axis.title.x=element_text(size = 14, face = "bold", family = "Arial"), + axis.ticks.x=element_blank()) + + theme(plot.title = element_text(hjust = 0.5)) + + theme(legend.text = element_text(size = 9, family = "Arial"), + legend.title = element_text(size = 9, family = "Arial", face = "bold"), legend.direction = "vertical", legend.box = "vertical")) +# Toxin Figure +Toxin_ONT_figure <- Toxin_ONT %>% filter(Toxin_prediction != "non-pathogenic") +Toxin_plot <- Toxin_ONT_figure %>% ggplot(aes(x=Sample, y=RNum_Gi, fill = Sample)) + + geom_bar(stat = "identity") + + facet_grid(~ Toxin_prediction) + + xlab(element_blank()) + + ylab("relative abundance (RNum_Gi)") + + ggtitle("Toxin") + + theme(legend.position = "none") # out comment for legend +# Virulence Figure +Virulence_ONT_figure <- Virulence_ONT %>% filter(virulence_prediction != "non-pathogenic") +Virulence_plot <- Virulence_ONT_figure %>% ggplot(aes(x=Sample, y=RNum_Gi, fill = Sample)) + + geom_bar(stat = "identity") + + facet_grid(~ virulence_prediction) + + xlab(element_blank()) + + ylab("relative abundance (RNum_Gi)") + + ggtitle("Virulence") + + theme(legend.position = "none") +# AMR overall Figure +AMR_ONT_overall_Figure <- AMR_ONT_overall %>% filter(AMR != "-") +AMR_plot <- AMR_ONT_overall_Figure %>% ggplot(aes(x=Sample, y=RNum_Gi, fill = Sample)) + + geom_bar(stat = "identity") + + facet_grid(~ AMR) + + xlab(element_blank()) + + ylab("relative abundance (RNum_Gi)") + + ggtitle("AMR") + +# Combine figures +library(cowplot) +plot_grid(Virulence_plot, Toxin_plot, AMR_plot, nrow = 1, labels = "auto") + diff --git a/figures/src/fig_crispr.R b/figures/src/fig_crispr.R new file mode 100644 index 0000000000000000000000000000000000000000..2085fac6c21a4ee0718ae717510699d66d61ac61 --- /dev/null +++ b/figures/src/fig_crispr.R @@ -0,0 +1,115 @@ +#!/usr/bin/Rscript + +## LOG FILE +sink(file=file(snakemake@log[[1]], open="wt"), type="message") + +## NOTE +# Plot CRISPR statistics + +## IMPORT +suppressMessages(library(testit)) +# suppressMessages(library(UpSetR)) +suppressMessages(library(ggplot2)) +# custom +source(snakemake@params$utils) + +## DATA +stats <- read.csv( + file=snakemake@input$stats, + sep="\t", + header=TRUE, + stringsAsFactors=FALSE, + check.names=FALSE +) +# change names +stats$crispr_tool <- CRISPR_TOOL_NAMES[stats$crispr_tool] +stats$asm_tool <- ASM_TOOL_NAMES[stats$asm_tool] +# aggregate: spacers per contig +# stats_aggr_sc <- aggregate( +# x=stats$spacers, +# by=list(seq_id=stats$seq_id, crispr_tool=stats$crispr_tool, asm_tool=stats$asm_tool), +# FUN=sum +# ) +# aggregate: spacers per assembly +stats_aggr_sa <- aggregate( + x=stats$spacers, + by=list(crispr_tool=stats$crispr_tool, asm_tool=stats$asm_tool), + FUN=sum +) +# aggregate: arrays per assembly +stats_aggr_aa <- aggregate( + x=stats$seq_id, + by=list(crispr_tool=stats$crispr_tool, asm_tool=stats$asm_tool), + FUN=length +) + +## PLOTS +# UpSetR plotting function +plot_upsetr <- function(df, asm_tool){ + asm_sets <- lapply( + CRISPR_TOOL_NAMES, + function(x){ unique(unlist(df[df$asm_tool==asm_tool & df$crispr_tool == x,"seq_id"])) } + ) + names(asm_sets) <- CRISPR_TOOL_NAMES[names(asm_sets)] + UpSetR::upset( + data=UpSetR::fromList(asm_sets), + order.by="degree", + decreasing=FALSE, + mainbar.y.label=sprintf("Contig intersection size (%s)", asm_tool), + sets.x.label="Contigs w/ CRISPR array(s)" + ) +} + +# Custom ggplot2 theme +my_theme <- + theme_bw() + + theme( + # legend + legend.title=element_blank(), + # grid + panel.grid=element_blank(), + # strip + strip.background=element_rect(fill="white"), + strip.text=element_text(size=12, color="black"), + # axes + axis.title=element_text(size=12, color="black"), + axis.text.y=element_text(size=9, color="black"), + axis.text.x=element_text(size=9, color="black", angle=90, vjust=0.5, hjust=1) + ) + +# List of ggplot plots +plots <- list() + +# Plot: Number of spacers per tool/assembly +plots$spacers_asm <- + ggplot(data=stats_aggr_sa, aes(x=crispr_tool, y=x, fill=asm_tool)) + + geom_col(position="dodge") + + scale_fill_manual(values=ASM_TOOL_COLORS$notmeth, guide="legend") + + labs( + x="", + y="Number of spacers" + ) + + my_theme + +# Plot: Number of arrays per tool/assembly +plots$arrays_asm <- + ggplot(data=stats_aggr_aa, aes(x=crispr_tool, y=x, fill=asm_tool)) + + geom_col(position="dodge") + + scale_fill_manual(values=ASM_TOOL_COLORS$notmeth, guide="legend") + + labs( + x="", + y="Number of CRISPR arrays" + ) + + my_theme + +# Plot: UpSetR (+ PDF) +# NOTE: +# For an unknown reason creating and saving multiple plots using a for-loop or sapply does not work - the PDF is empty. +# Creating a PDF per plot in a for-loop or using sapply does not work either - each PDF is empty. +pdf(snakemake@output$pdf, width=snakemake@params$width, height=snakemake@params$height) +for(pp in plots){ print(pp) } +plot_upsetr(stats, "Flye") +plot_upsetr(stats, "MEGAHIT") +plot_upsetr(stats, "metaSPAdes") +plot_upsetr(stats, "metaSPAdes (H)") +dev.off() diff --git a/figures/src/fig_mmseq_upsetr.R b/figures/src/fig_mmseq_upsetr.R new file mode 100644 index 0000000000000000000000000000000000000000..43ad473a5bd4b3fa54026dc591da661b7ff2f115 --- /dev/null +++ b/figures/src/fig_mmseq_upsetr.R @@ -0,0 +1,53 @@ +#!/usr/bin/Rscript + +## LOG FILE +sink(file=file(snakemake@log[[1]], open="wt"), type="message") + +## NOTE +# UpSetR +# https://cran.r-project.org/web/packages/UpSetR/vignettes/basic.usage.html + +## IMPORT +# suppressMessages(library(testit)) +suppressMessages(library(UpSetR)) +# custom +source(snakemake@params$utils) + +## DATA +# overlap/intersection sizes +overlap_sizes <- read.csv( + file=snakemake@input$overlap_sizes, + sep=" ", + header=FALSE, + stringsAsFactors=FALSE +) +# check: all expected tools are there +# testit::assert(all(sapply(names(ASM_TOOL_NAMES), function(x){ x %in% overlap_sizes$V2 }))) +# process names +for(tname in names(ASM_TOOL_NAMES)){ + overlap_sizes$V2 <- sub(tname, ASM_TOOL_NAMES[tname], overlap_sizes$V2) +} +overlap_sizes$V2 <- gsub("_", "&", overlap_sizes$V2) +# create input for UpSetR +overlap_names <- overlap_sizes$V2 +overlap_sizes <- overlap_sizes$V1 +names(overlap_sizes) <- overlap_names + +## PLOT +pdf(snakemake@output$pdf, width=snakemake@params$width, height=snakemake@params$height, onefile=FALSE) +UpSetR::upset( + data=UpSetR::fromExpression(overlap_sizes), + # overlap order + order.by="degree", + decreasing=FALSE, + # colors + set.metadata=list( + data=data.frame( + sets=ASM_TOOL_NAMES_NOTMETH, + Tool=ASM_TOOL_NAMES_NOTMETH, + stringsAsFactors=FALSE + ), + plots=list(list(type="matrix_rows", column="Tool", colors=ASM_TOOL_COLORS$notmeth, alpha=0.7)) + ) +) +dev.off() diff --git a/figures/src/fig_nanostats.R b/figures/src/fig_nanostats.R new file mode 100644 index 0000000000000000000000000000000000000000..a76a9d5ac3176fe3ce6e2fb2683728d55330d715 --- /dev/null +++ b/figures/src/fig_nanostats.R @@ -0,0 +1,62 @@ +#!/usr/bin/Rscript + +## LOG FILE +sink(file=file(snakemake@log[[1]], open="wt"), type="message") + +## NOTE +# Plot NanoStats statistics + +## IMPORT +suppressMessages(library(testit)) +suppressMessages(library(ggplot2)) +suppressMessages(library(reshape2)) +# custom +source(snakemake@params$utils) + +## DATA +stats <- read.csv( + file=snakemake@input$stats, + sep="\t", + header=TRUE, + stringsAsFactors=FALSE, + check.names=FALSE +) +# change names +stats$meth_flag <- METHYLATION_NAMES[stats$meth_flag] +stats$sel_flag <- RSEL_TOOL_NAMES[stats$sel_flag] +# reshape +stats_melted <- reshape2::melt(stats, id.vars=c("meth_flag", "sel_flag")) +stats_melted$meth_flag <- factor(stats_melted$meth_flag, levels=METHYLATION_NAMES, ordered=TRUE) +stats_melted$sel_flag <- factor(stats_melted$sel_flag, levels=RSEL_TOOL_NAMES, ordered=TRUE) + +## PLOT +stats_p <- + ggplot(data=stats_melted, aes(x=sel_flag, y=value, fill=meth_flag)) + + geom_col(position="dodge") + + scale_fill_manual(values=METHYLATION_COLORS$new, guide="legend") + + facet_wrap(vars(variable), ncol=1, scales="free_y") + + labs( + # title="", + # subtitle="", + x="", + y="" + ) + + theme_bw() + + theme( + # legend + legend.title=element_blank(), + # grid + panel.grid=element_blank(), + # strip + strip.background=element_rect(fill="white"), + strip.text=element_text(size=12, color="black"), + # axes + axis.title=element_text(size=12, color="black"), + axis.text.y=element_text(size=9, color="black"), + axis.text.x=element_text(size=9, color="black", angle=90, vjust=0.5, hjust=1) + ) + +## PDF +pdf(snakemake@output$pdf, width=snakemake@params$width, height=snakemake@params$height) +print(stats_p) +dev.off() diff --git a/figures/src/fig_partial_genes.R b/figures/src/fig_partial_genes.R new file mode 100644 index 0000000000000000000000000000000000000000..dda766ae21dc4774be853c9c6b234cac16456d5f --- /dev/null +++ b/figures/src/fig_partial_genes.R @@ -0,0 +1,64 @@ +#!/usr/bin/Rscript + +## LOG FILE +sink(file=file(snakemake@log[[1]], open="wt"), type="message") + +## NOTE +# Plot protein/gene counts + +## IMPORT +suppressMessages(library(testit)) +suppressMessages(library(ggplot2)) +suppressMessages(library(reshape2)) +# custom +source(snakemake@params$utils) + +## DATA +counts <- read.csv( + file=snakemake@input$counts, + sep="\t", + header=TRUE, + stringsAsFactors=FALSE, + check.names=FALSE +) +# change tool names +counts$tool <- GENE_TOOL_NAMES[counts$tool] +colnames(counts) <- sapply(colnames(counts), function(x){ ifelse(x %in% names(ASM_TOOL_NAMES), ASM_TOOL_NAMES[x], x) }) +# reshape +counts_melted <- reshape2::melt(counts, id.vars="tool") +# Gene tool name - as ordered factor +counts_melted$tool <- factor(counts_melted$tool, levels=GENE_TOOL_NAMES, ordered=TRUE) +# Gene tool name - short version +counts_melted$tool_name <- sapply(as.character(counts_melted$tool), function(x){ unlist(strsplit(x, " "))[1] }) + +## PLOT +counts_p <- + ggplot(data=counts_melted, aes(x=tool, y=value, fill=variable)) + + geom_col(position = "dodge") + + scale_fill_manual(values=ASM_TOOL_COLORS$notmeth, guide="legend") + + facet_wrap(vars(tool_name), nrow=1, scales="free_x") + + labs( + # title="", + # subtitle="", + x="", + y="Number of proteins/genes" + ) + + theme_bw() + + theme( + # legend + legend.title=element_blank(), + # grid + panel.grid=element_blank(), + # strip + strip.background=element_rect(fill="white"), + strip.text=element_text(size=12, color="black"), + # axes + axis.title=element_text(size=12, color="black"), + axis.text.y=element_text(size=9, color="black"), + axis.text.x=element_text(size=9, color="black", angle=90, vjust=0.5, hjust=1) + ) + +## PDF +pdf(snakemake@output$pdf, width=snakemake@params$width, height=snakemake@params$height) +print(counts_p) +dev.off() diff --git a/figures/src/fig_plasflow.R b/figures/src/fig_plasflow.R new file mode 100644 index 0000000000000000000000000000000000000000..927ccce75255779cd20df47e920057ef6afd804f --- /dev/null +++ b/figures/src/fig_plasflow.R @@ -0,0 +1,63 @@ +#!/usr/bin/Rscript + +## LOG FILE +sink(file=file(snakemake@log[[1]], open="wt"), type="message") + +## NOTE +# Plot PlasFlow statistics + +## IMPORT +suppressMessages(library(testit)) +suppressMessages(library(UpSetR)) +suppressMessages(library(ggplot2)) +suppressMessages(library(reshape2)) +# custom +source(snakemake@params$utils) + +## DATA +stats <- read.csv( + file=snakemake@input$stats, + sep="\t", + header=TRUE, + stringsAsFactors=FALSE, + check.names=FALSE +) +# change names +stats$tool <- ASM_TOOL_NAMES[stats$tool] +# reshape +stats_melted <- reshape2::melt(stats, id.vars=c("label", "type", "tool"), variable.name="statstype") + +## PLOT +plots <- list() +dtype <- "class" +for(stype in unique(stats_melted$statstype)){ + plots[[sprintf("%s_%s", dtype, stype)]] <- + ggplot( + data=stats_melted[stats_melted$statstype == stype & stats_melted$type == dtype,], + aes_string(x="tool", y="value", fill="label") + ) + + geom_col(position="dodge") + + scale_fill_manual(values=PLASFLOW_COLORS[[dtype]], guide="legend") + + labs( + # title="", + # subtitle="", + x="", + y=PLASFLOW_NAMES$statstype[stype] + ) + + theme_bw() + + theme( + # legend + legend.title=element_blank(), + # grid + panel.grid=element_blank(), + # axes + axis.title=element_text(size=12, color="black"), + axis.text.y=element_text(size=9, color="black"), + axis.text.x=element_text(size=9, color="black", angle=90, vjust=0.5, hjust=1) + ) +} + +## PDF +pdf(snakemake@output$pdf, width=snakemake@params$width, height=snakemake@params$height) +for(pp in plots){ print(pp) } +dev.off() diff --git a/figures/src/fig_quast.R b/figures/src/fig_quast.R new file mode 100644 index 0000000000000000000000000000000000000000..e6bbdad55ef5d1cdf77d1888ac25f90565efe02f --- /dev/null +++ b/figures/src/fig_quast.R @@ -0,0 +1,66 @@ +#!/usr/bin/Rscript + +## LOG FILE +sink(file=file(snakemake@log[[1]], open="wt"), type="message") + +## NOTE +# Plot QUAST statistics + +## IMPORT +suppressMessages(library(testit)) +suppressMessages(library(ggplot2)) +suppressMessages(library(reshape2)) +# custom +source(snakemake@params$utils) + +## DATA +stats <- read.csv( + file=snakemake@input$stats, + sep="\t", + header=TRUE, + stringsAsFactors=FALSE, + check.names=FALSE +) +# change tool names +colnames(stats) <- sapply(colnames(stats), function(x){ ifelse(x %in% names(ASM_TOOL_NAMES), ASM_TOOL_NAMES[x], x) }) +# stat.s to plot +stats_vars <- c( + "# contigs", + "Largest contig", + "Total length", + "N50", + "L50", + "# N's per 100 kbp" +) +# reshape +stats_melted <- reshape2::melt(stats, id.vars="Assembly") + +## PLOT +stats_p <- + ggplot(data=stats_melted[sapply(stats$Assembly, function(x){ x %in% stats_vars }),], aes(x=variable, y=value)) + + geom_col(aes(fill=variable)) + + scale_fill_manual(values=c(ASM_TOOL_COLORS$notmeth, ASM_TOOL_COLORS$meth), guide=NULL) + + facet_wrap(vars(Assembly), ncol=1, scales="free_y") + + labs( + # title="", + # subtitle="", + x="", + y="QUAST statistic" + ) + + theme_bw() + + theme( + # grid + panel.grid=element_blank(), + # strip + strip.background=element_rect(fill="white"), + strip.text=element_text(size=12, color="black"), + # axes + axis.title=element_text(size=12, color="black"), + axis.text.y=element_text(size=9, color="black"), + axis.text.x=element_text(size=9, color="black", angle=90, vjust=0.5, hjust=1) + ) + +## PDF +pdf(snakemake@output$pdf, width=snakemake@params$width, height=snakemake@params$height) +print(stats_p) +dev.off() diff --git a/figures/src/fig_rgi.R b/figures/src/fig_rgi.R new file mode 100644 index 0000000000000000000000000000000000000000..eb9cf47d8cd0dc2da93a22026eba7af46a32f7f8 --- /dev/null +++ b/figures/src/fig_rgi.R @@ -0,0 +1,115 @@ +#!/usr/bin/Rscript + +## LOG FILE +sink(file=file(snakemake@log[[1]], open="wt"), type="message") + +## NOTE +# Plot RGI statistics + +## IMPORT +suppressMessages(library(testit)) +suppressMessages(library(UpSetR)) +suppressMessages(library(ggplot2)) +suppressMessages(library(reshape2)) +suppressMessages(library(UpSetR)) +# custom +source(snakemake@params$utils) + +## DATA +stats <- read.csv( + file=snakemake@input$stats, + sep="\t", + header=TRUE, + stringsAsFactors=FALSE, + check.names=FALSE +) +# change names +colnames(stats) <- sapply(colnames(stats), function(x){ ifelse(x %in% names(ASM_TOOL_NAMES), ASM_TOOL_NAMES[x], x) }) +# reshape +stats_melted <- reshape2::melt(stats, id.vars=c("label", "col", "type"), variable.name="tool", value.name="count") +# change names +# stats_melted$tool <- ASM_TOOL_NAMES[as.character(stats_melted$tool)] + +## PLOT: counts +my_theme <- + theme_bw() + + theme( + # grid + panel.grid=element_blank(), + # strip + strip.background=element_rect(fill="white"), + strip.text=element_text(size=9, color="black"), + # axes + axis.title=element_text(size=12, color="black"), + axis.text.y=element_text(size=9, color="black"), + axis.text.x=element_text(size=9, color="black", angle=90, vjust=0.5, hjust=1) + ) + +plots <- list() +for(ctype in unique(stats_melted$type)){ + # Counts per feature type + for(col in unique(stats_melted$col)){ + df <- stats_melted[stats_melted$type == ctype & stats_melted$col == col,] + plots[[sprintf("%s_%s", ctype, col)]] <- + ggplot(data=df, aes(x=tool, y=count, fill=tool)) + + geom_col() + + scale_fill_manual(values=ASM_TOOL_COLORS$notmeth, guide=NULL) + + facet_wrap(vars(label), ncol=round(sqrt(length(unique(df$label)))), scales="free_y") + + labs( + # title="", + subtitle=sprintf("Hits: %s", ctype), + x="", + y=RGI_NAMES$col[col] + ) + + my_theme + } +} +# Total counts +df <- stats_melted[stats_melted$col == "ARO",] +df <- aggregate(df$count, by=list(tool=df$tool, type=df$type), FUN=sum) +plots[[sprintf("%s_%s", ctype, "total")]] <- + ggplot(data=df, aes(x=tool, y=x, fill=tool)) + + geom_col() + + scale_fill_manual(values=ASM_TOOL_COLORS$notmeth, guide=NULL) + + facet_wrap(vars(type), ncol=3) + + labs( + x="", + y="Total hits" + ) + + my_theme + +## PLOT: intersection & PDF +plot_overlap <- function(df, ctype, col){ + df <- df[df$type == ctype & df$col == col,] + df_list <- lapply(ASM_TOOL_NAMES_NOTMETH, function(x){ df[df[,x] > 0,"label"] }) + names(df_list) <- ASM_TOOL_NAMES_NOTMETH[names(df_list)] + UpSetR::upset( + data=UpSetR::fromList(df_list), + # overlap order + order.by="degree", + decreasing=FALSE, + # y-label title + mainbar.y.label=sprintf("Intersection size (%s hits, %s)", ctype, RGI_NAMES$col[col]), + # text size + text.scale = c(2, 2, 1.5, 1.5, 2, 2), + # colors + set.metadata=list( + data=data.frame( + sets=names(df_list), + Tool=names(df_list), + stringsAsFactors=FALSE + ), + plots=list(list(type="matrix_rows", column="Tool", colors=ASM_TOOL_COLORS$notmeth, alpha=0.7)) + ) + ) +} + + +pdf(snakemake@output$pdf, width=snakemake@params$width, height=snakemake@params$height) +# Count plots +for(pp in plots){ print(pp) } +# Intersection plot +plot_overlap(stats, "all", "ARO") +plot_overlap(stats, "nudged", "ARO") +plot_overlap(stats, "strict", "ARO") +dev.off() diff --git a/figures/src/utils.R b/figures/src/utils.R new file mode 100644 index 0000000000000000000000000000000000000000..eb191361b41ba75f239d6a87e9136ae15dbc358c --- /dev/null +++ b/figures/src/utils.R @@ -0,0 +1,103 @@ +#!/usr/bin/Rscript + +suppressMessages(library(ggsci)) # colors + +############################## +# Assemblers +# how the tool names should be changed +ASM_TOOL_NAMES_NOTMETH <- c( # IMPORTANT: the order matters !!! + # not methylation-aware + "flye"="Flye", + "megahit"="MEGAHIT", + "metaspades_hybrid"="metaSPAdes (H)", + "metaspades"="metaSPAdes" +) +ASM_TOOL_NAMES_METH <- c( # IMPORTANT: the order matters !!! + # methylation-aware + "meth_flye"="Flye (M)", + "meth_megahit"="MEGAHIT (M)", + "meth_metaspades_hybrid"="metaSPAdes (M, H)", + "meth_metaspades"="metaSPAdes (M)" +) +ASM_TOOL_NAMES <- c( # IMPORTANT: the order matters !!! + ASM_TOOL_NAMES_METH, + ASM_TOOL_NAMES_NOTMETH +) +# tool colors +ASM_TOOL_COLORS <- list() +# methylation-aware +ASM_TOOL_COLORS$meth <- ggsci::pal_nejm("default", alpha=1)(4) +names(ASM_TOOL_COLORS$meth) <- ASM_TOOL_NAMES_METH +ASM_TOOL_COLORS$meth_raw <- ASM_TOOL_COLORS$meth +names(ASM_TOOL_COLORS$meth_raw) <- names(ASM_TOOL_NAMES_METH) +# not methylation-aware +ASM_TOOL_COLORS$notmeth <- ggsci::pal_nejm("default", alpha=0.7)(4) +names(ASM_TOOL_COLORS$notmeth) <- ASM_TOOL_NAMES_NOTMETH +ASM_TOOL_COLORS$notmeth_raw <- ASM_TOOL_COLORS$notmeth +names(ASM_TOOL_COLORS$notmeth_raw) <- names(ASM_TOOL_NAMES_NOTMETH) + +############################## +# Gene tools +GENE_TOOL_NAMES <- c( + "prodigal_partial"="Prodigal (partial)", + "prodigal_total"="Prodigal (total)", + "cdhit_unique"="CD-HIT (unique)", + "cdhit_total"="CD-HIT (total)" +) + +############################## +# Read selection methods +RSEL_TOOL_NAMES <- c( + "size_selected"="size selected", + "gtube"="g-TUBE", + "merged"="merged" +) + +############################## +# Methylation flag +METHYLATION_NAMES <- c( + "M"="Methylation aware", + "N"="Non-methylation aware" +) + +METHYLATION_COLORS <- list( + raw=ggsci::pal_nejm("default", alpha=1)(6)[c(5,6)] +) +METHYLATION_COLORS$new <- METHYLATION_COLORS$raw +names(METHYLATION_COLORS$raw) <- names(METHYLATION_NAMES) +names(METHYLATION_COLORS$new) <- METHYLATION_NAMES + +############################## +# CRISPR tools +CRISPR_TOOL_NAMES <- c( + "minced"="MinCED", + "casc"="CasC" +) + +############################## +# PlasFlow +PLASFLOW_NAMES <- list( + statstype=c( + count="Sequence count", + sum="Cumulative sequence length [bp]", + count_pct="Sequence count [%]", + sum_pct="Cumulative sequence length [%]" + ) +) + +PLASFLOW_COLORS <- list( + class=ggsci::pal_nejm("default", alpha=1)(4)[c(2,3,4)] +) +names(PLASFLOW_COLORS$class) <- c("chromosome", "plasmid", "unclassified") + +############################## +# RGI +RGI_NAMES <- list( + col=c( + "Best_Hit_ARO"="ARO term", + "ARO"="ARO", + "Drug Class"="Drug class", + "Resistance Mechanism"="Resistance mechanism", + "AMR Gene Family"="Gene family" + ) +) \ No newline at end of file diff --git a/figures/src/utils.py b/figures/src/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2a71dcf17fbc83f1f23431a8699fb2116f21d173 --- /dev/null +++ b/figures/src/utils.py @@ -0,0 +1,79 @@ +#!/usr/bin/python + +def parse_casc_report(ifile_path): + import pandas + # read in + summary = pandas.read_csv(ifile_path, sep="\t", header=0, usecols=["#seq_id", "spacers", "array_start", "array_stop", "bonafide"], comment=None) + # rename columns + summary.rename(columns={"#seq_id": "seq_id"}, inplace=True) + # filter + summary = summary[summary["bonafide"]] + assert all(summary["bonafide"]) # just a check + # drop columns + summary.drop(columns=["bonafide"], inplace=True) + return summary + +def parse_minced_report(ifile_path): + """ + Parse the MINCED report: for each sequence and found CRISPR array, extract + - array start and end + - number of spacers = number of unique spacer sequences + + Entry (sequence/CRISPR array) format: + Sequence '<string>' (<int> bp) + + CRISPR <int> Range: <int> - <int> + POSITION REPEAT SPACER + -------- ------------------------------------ ------------------------------ + <int> <string: ACTG> <string: ACTG> [ <int>, <int> ] + <...> + <int> <string: ACTG> + -------- ------------------------------------ ------------------------------ + Repeats: <int> Average Length: <int> Average Length: <int> + + CRISPR <int> Range: <int> - <int> + <...> + + Time to find repeats: <int> ms + """ + import re + summary = [] + with open(ifile_path, "r") as ifile: + new_seq = 0 + seq_id = None + for line in ifile: + if re.match("Sequence ", line): + assert new_seq == 0, "Unexpected parsing error in {} in {}".format(line, ifile_path) + line_re = re.search(re.compile("^Sequence '(?P<seq_id>[\w\.]+)' \((?P<seq_len>\d+) bp\)$"), line) + assert line_re, "Could not extract info from \"{}\" in {}".format(line, ifile_path) + seq_id = line_re.group("seq_id") + new_seq = 1 + elif re.match("CRISPR ", line): + assert new_seq == 1, "No seq before {} in {}".format(line, ifile_path) + line_re = re.search(re.compile("^CRISPR (?P<crispr_id>\d+)\s+Range: (?P<array_start>\d+) - (?P<array_stop>\d+)$"), line) + assert line_re, "Could not extract info from \"{}\" in {}".format(line, ifile_path) + summary.append({ + "seq_id": seq_id, + "array_start": int(line_re.group("array_start")), + "array_stop": int(line_re.group("array_stop")), + "spacers": set() + }) + new_seq += 1 + elif re.match("POSITION", line): + assert new_seq == 2, "No new seq/CRISPR before {} in {}".format(line, ifile_path) + new_seq += 1 + elif re.match("[0-9]+\s+", line): + assert re.match("[0-9]+\s+[ACTG]+", line), "Unexpected repeat/spacer format in {} in {}".format(line, ifile_path) + assert new_seq == 3, "No new seq./CRISPR before {} in {}".format(line, ifile_path) + if re.match("[0-9]+\s+[ACTG]+\s+[ACTG]+\s+", line): + line_re = re.search(re.compile("^[0-9]+\s+(?P<repeat>[ACTG]+)\s+(?P<spacer>[ACTG]+)\s+.*$"), line) + assert line_re, "Could not extract repeat/spacer from \"{}\" in {}".format(line, ifile_path) + summary[len(summary)-1]["spacers"].add(line_re.group("spacer")) + elif re.match("Repeats:", line): + new_seq = 1 + summary[len(summary)-1]["spacers"] = len(summary[len(summary)-1]["spacers"]) + elif re.match("Time to find repeats", line): + assert new_seq == 1 and summary[len(summary)-1]["spacers"] > 0, "No new seq./CRISPR/spacers before {} in {}".format(line, ifile_path) + new_seq = 0 + seq_id = None + return summary diff --git a/figures/src_SBB/2019_GDB_partial_genes.R b/figures/src_SBB/2019_GDB_partial_genes.R new file mode 100644 index 0000000000000000000000000000000000000000..29035438973ca5d65cd7350263ad30b10fb6ebf1 --- /dev/null +++ b/figures/src_SBB/2019_GDB_partial_genes.R @@ -0,0 +1,33 @@ +setwd("~/Documents/R_data/nanopore_ONT/") +library(tidyverse) +partial <- read.table("2019_GDB_protein_partial_genes.txt", header = TRUE) + +# pivoting the dataset +long_merged <- pivot_longer(partial, cols=c(-tool), names_to = "group", values_to = "counts") + +# setting a common theme +theme_set(theme_bw() + + theme(strip.background = element_rect(colour = "black", fill = "white"), strip.text = element_text(size=17, family="Helvetica")) + + theme(axis.text.x = element_text(angle = 90, size = 18, family = "Helvetica", face = "bold", color = "black"), + axis.text.y = element_text(size = 18, family = "Helvetica"), + axis.title.y = element_text(size = 18, face = "bold", family = "Helvetica"), + axis.title.x=element_text(size = 18, face = "bold", family = "Helvetica"), + axis.ticks.x=element_blank()) + + theme(plot.title = element_text(hjust = 0.5)) + + theme(legend.text = element_text(size = 18, family = "Helvetica"), + legend.title = element_text(size = 18, family = "Helvetica", face = "bold"), legend.direction = "vertical", legend.box = "vertical")) + + +# Basic plots +install.packages("devtools") +library(devtools) +install_github("kassambara/easyGgplot2") +library(easyGgplot2) +data_long <- long_merged + +ggplot(data_long) + +geom_bar(aes(fill=group, y=counts, x=tool), stat = "identity", position=position_dodge()) + scale_fill_manual("assembly", values = c("metaspades" = "grey", "metaspades_hybrid" = "dodgerblue2")) + + +ggplot2.barplot(data=data_long, xName='tool', yName='counts', + groupName='group', position=position_dodge()) diff --git a/figures/src_SBB/MAGS_2019_GDB_analyses.R b/figures/src_SBB/MAGS_2019_GDB_analyses.R new file mode 100644 index 0000000000000000000000000000000000000000..253961e374b7e3e4418d6d650fc2baa8bfe62357 --- /dev/null +++ b/figures/src_SBB/MAGS_2019_GDB_analyses.R @@ -0,0 +1,133 @@ +# Assessing the number and kind of bins found due to the different assembly and mapping methods used for the 2019_GDB ONT analsyes + +setwd("~/Documents/R_data/nanopore_ONT/") +library(tidyverse) + +# importing files and renaming columns +bwa_lr_MH <- read.table("bwa_lr_metaspades_hybrid_HQ_bins.txt", header = FALSE) %>% rename(bin = V1, taxonomy = V2, completion = V3, contamination = V4, strain_heterogeneity = V5) +bwa_sr_MH <- read.table("bwa_sr_metaspades_hybrid_HQ_bins.txt", header = FALSE) %>% rename(bin = V1, taxonomy = V2, completion = V3, contamination = V4, strain_heterogeneity = V5) +bwa_merged_MH <- read.table("bwa_merged_metaspades_hybrid_HQ_bins.txt", header = FALSE) %>% rename(bin = V1, taxonomy = V2, completion = V3, contamination = V4, strain_heterogeneity = V5) +mmi_lr_MH <- read.table("mmi_lr_metaspades_hybrid_HQ_bins.txt", header = FALSE) %>% rename(bin = V1, taxonomy = V2, completion = V3, contamination = V4, strain_heterogeneity = V5) +mmi_sr_MH <- read.table("mmi_sr_metaspades_hybrid_HQ_bins.txt", header = FALSE) %>% rename(bin = V1, taxonomy = V2, completion = V3, contamination = V4, strain_heterogeneity = V5) +mmi_merged_MH <- read.table("mmi_merged_metaspades_hybrid_HQ_bins.txt", header = FALSE) %>% rename(bin = V1, taxonomy = V2, completion = V3, contamination = V4, strain_heterogeneity = V5) +flye <- read.table("flye_HQ_bins.txt", header = FALSE) %>% rename(bin = V1, taxonomy = V2, completion = V3, contamination = V4, strain_heterogeneity = V5) +megahit <- read.table("megahit_HQ_bins.txt", header = FALSE) %>% rename(bin = V1, taxonomy = V2, completion = V3, contamination = V4, strain_heterogeneity = V5) + +# removing the "bin" column and separating taxonomy into individual columns +bwa_lr_MH_edited <- bwa_lr_MH %>% select(-bin) %>% + separate(taxonomy, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = ";", remove = FALSE) +bwa_sr_MH_edited <- bwa_sr_MH %>% select(-bin) %>% + separate(taxonomy, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = ";", remove = FALSE) +bwa_merged_MH_edited <- bwa_merged_MH %>% select(-bin) %>% + separate(taxonomy, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = ";", remove = FALSE) +mmi_lr_MH_edited <- mmi_lr_MH %>% select(-bin) %>% + separate(taxonomy, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = ";", remove = FALSE) +mmi_sr_MH_edited <- mmi_sr_MH %>% select(-bin) %>% + separate(taxonomy, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = ";", remove = FALSE) +mmi_merged_MH_edited <- mmi_merged_MH %>% select(-bin) %>% + separate(taxonomy, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = ";", remove = FALSE) +flye_edited <- flye %>% select(-bin) %>% + separate(taxonomy, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = ";", remove = FALSE) +megahit_edited <- megahit %>% select(-bin) %>% + separate(taxonomy, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = ";", remove = FALSE) + + +# Keeping only the Family and Genus columns and adding family if the latter is missing +bwa_lr_tax <- bwa_lr_MH_edited %>% select(Family, Genus, completion, contamination, strain_heterogeneity) %>% mutate(Genus = gsub("g__", "", Genus), Family = gsub("f__", "", Family)) %>% mutate_if(is.character, list(~na_if(.,""))) + +bwa_sr_tax <- bwa_sr_MH_edited %>% select(Family, Genus, completion, contamination, strain_heterogeneity) %>% mutate(Genus = gsub("g__", "", Genus), Family = gsub("f__", "", Family)) %>% mutate_if(is.character, list(~na_if(.,""))) + +bwa_merged_tax <- bwa_merged_MH_edited %>% select(Family, Genus, completion, contamination, strain_heterogeneity) %>% mutate(Genus = gsub("g__", "", Genus), Family = gsub("f__", "", Family)) %>% mutate_if(is.character, list(~na_if(.,""))) + +mmi_lr_tax <- mmi_lr_MH_edited %>% select(Family, Genus, completion, contamination, strain_heterogeneity) %>% mutate(Genus = gsub("g__", "", Genus), Family = gsub("f__", "", Family)) %>% mutate_if(is.character, list(~na_if(.,""))) + +mmi_sr_tax <- mmi_sr_MH_edited %>% select(Family, Genus, completion, contamination, strain_heterogeneity) %>% mutate(Genus = gsub("g__", "", Genus), Family = gsub("f__", "", Family)) %>% mutate_if(is.character, list(~na_if(.,""))) + +mmi_merged_tax <- mmi_merged_MH_edited %>% select(Family, Genus, completion, contamination, strain_heterogeneity) %>% mutate(Genus = gsub("g__", "", Genus), Family = gsub("f__", "", Family)) %>% mutate_if(is.character, list(~na_if(.,""))) + +flye_tax <- flye_edited %>% select(Family, Genus, completion, contamination, strain_heterogeneity) %>% mutate(Genus = gsub("g__", "", Genus), Family = gsub("f__", "", Family)) %>% mutate_if(is.character, list(~na_if(.,""))) + +megahit_tax <- megahit_edited %>% select(Family, Genus, completion, contamination, strain_heterogeneity) %>% mutate(Genus = gsub("g__", "", Genus), Family = gsub("f__", "", Family)) %>% mutate_if(is.character, list(~na_if(.,""))) + +# code replacing NAs in 'Genus' with the values from adjacent column 'Family' +bwa_lr_tax$Genus[is.na(bwa_lr_tax$Genus)] <- bwa_lr_tax$Family[is.na(bwa_lr_tax$Genus)] +bwa_sr_tax$Genus[is.na(bwa_sr_tax$Genus)] <- bwa_sr_tax$Family[is.na(bwa_sr_tax$Genus)] +bwa_merged_tax$Genus[is.na(bwa_merged_tax$Genus)] <- bwa_merged_tax$Family[is.na(bwa_merged_tax$Genus)] +mmi_lr_tax$Genus[is.na(mmi_lr_tax$Genus)] <- mmi_lr_tax$Family[is.na(mmi_lr_tax$Genus)] +mmi_sr_tax$Genus[is.na(mmi_sr_tax$Genus)] <- mmi_sr_tax$Family[is.na(mmi_sr_tax$Genus)] +mmi_merged_tax$Genus[is.na(mmi_merged_tax$Genus)] <- mmi_merged_tax$Family[is.na(mmi_merged_tax$Genus)] +flye_tax$Genus[is.na(flye_tax$Genus)] <- flye_tax$Family[is.na(flye_tax$Genus)] +megahit_tax$Genus[is.na(megahit_tax$Genus)] <- megahit_tax$Family[is.na(megahit_tax$Genus)] + +# making the Genus names unique for easier tracking and merging downstream +bwa_lr_tax <- bwa_lr_tax %>% mutate(Genus = make.unique(Genus)) %>% select(-Family) +bwa_sr_tax <- bwa_sr_tax %>% mutate(Genus = make.unique(Genus)) %>% select(-Family) +bwa_merged_tax <- bwa_merged_tax %>% mutate(Genus = make.unique(Genus)) %>% select(-Family) +mmi_lr_tax <- mmi_lr_tax %>% mutate(Genus = make.unique(Genus)) %>% select(-Family) +mmi_sr_tax <- mmi_sr_tax %>% mutate(Genus = make.unique(Genus)) %>% select(-Family) +mmi_merged_tax <- mmi_merged_tax %>% mutate(Genus = make.unique(Genus)) %>% select(-Family) +flye_tax <- flye_tax %>% mutate(Genus = make.unique(Genus)) %>% select(-Family) +megahit_tax <- megahit_tax %>% mutate(Genus = make.unique(Genus)) %>% select(-Family) + +# combining all samples based on the Genus, and making additional column with sample name +bwa_lr_final <- bwa_lr_tax %>% mutate(Genus = make.unique(Genus)) %>% select(Genus) %>% mutate(bwa_lr = Genus) +bwa_sr_final <- bwa_sr_tax %>% mutate(Genus = make.unique(Genus)) %>% select(Genus) %>% mutate(bwa_sr = Genus) +bwa_merged_final <- bwa_merged_tax %>% mutate(Genus = make.unique(Genus)) %>% select(Genus) %>% mutate(bwa_merged = Genus) +mmi_lr_final <- mmi_lr_tax %>% mutate(Genus = make.unique(Genus)) %>% select(Genus) %>% mutate(mmi_lr = Genus) +mmi_sr_final <- mmi_sr_tax %>% mutate(Genus = make.unique(Genus)) %>% select(Genus) %>% mutate(mmi_sr = Genus) +mmi_merged_final <- mmi_merged_tax %>% mutate(Genus = make.unique(Genus)) %>% select(Genus) %>% mutate(mmi_merged = Genus) +flye_final <- flye_tax %>% mutate(Genus = make.unique(Genus)) %>% select(Genus) %>% mutate(flye = Genus) +megahit_final<- megahit_tax %>% mutate(Genus = make.unique(Genus)) %>% select(Genus) %>% mutate(megahit = Genus) + +merged_2019_tax <- full_join(x = bwa_lr_final, y = bwa_sr_final, + by = "Genus") %>% + full_join( x = ., y = bwa_merged_final, by = "Genus") %>% + full_join( x = ., y = mmi_lr_final, by = "Genus") %>% + full_join( x = ., y = mmi_sr_final, by = "Genus") %>% + full_join( x = ., y = mmi_merged_final, by = "Genus") %>% + full_join( x = ., y = flye_final, by = "Genus")%>% + full_join( x = ., y = megahit_final, by = "Genus") +# %>% arrange(Genus) %>% replace(is.na(.), 0) + +long_tax <- pivot_longer(merged_2019_tax, cols = bwa_lr:megahit, names_to = "samples") + +# setting a common theme +theme_set(theme_bw() + + theme(strip.background = element_rect(colour = "black", fill = "white"), strip.text = element_text(size=17, family="Helvetica")) + + theme(axis.text.x = element_text(angle = 90, size = 18, family = "Helvetica"), + axis.text.y = element_text(size = 18, family = "Helvetica"), + axis.title.y = element_text(size = 18, face = "bold", family = "Helvetica"), + axis.title.x=element_text(size = 18, face = "bold", family = "Helvetica"), + axis.ticks.x=element_blank()) + + theme(plot.title = element_text(hjust = 0.5)) + + theme(legend.text = element_text(size = 18, family = "Helvetica"), + legend.title = element_text(size = 18, family = "Helvetica", face = "bold"), legend.direction = "vertical", legend.box = "vertical")) + +# plotting +ggplot(long_tax, aes(x = samples, y = value)) +long_tax <- long_tax %>% drop_na() + +long_tax %>% + # group_by(Genus) %>% + # mutate(rescale = scales::rescale(Sqrt.abundance)) %>% + ggplot(., aes(x = factor(samples), y = value)) + + geom_tile(aes(fill = Genus), color = "white") + +# scale_alpha(range = c(0.1, 1)) + + theme(legend.position = "none") + + # xlab(label = "Library Preparation Method") + + facet_grid(~ samples, switch = "x", scales = "free_x", space = "free_x") + + # scale_fill_gradient(name = "Sqrt(Abundance)", + # low = "#FFFFFF", + # high = "#012345") + + theme(strip.placement = "outside") + + theme(axis.text.x = element_blank(), + axis.text.y = element_text(color = scales::hue_pal()(length(levels(as.factor(long_tax$Genus)))))) + + ggtitle(label = "2019_GDB - MAGs") + +# Summary table +View(merged_2019_tax) + +ggplot(data.frame(long_tax), aes(x=samples, fill=Genus)) + + geom_bar(position = "fill") +#+ scale_fill_brewer(palette = "Dark2") + diff --git a/figures/src_SBB/mappability_index-2020-05-26.R b/figures/src_SBB/mappability_index-2020-05-26.R new file mode 100644 index 0000000000000000000000000000000000000000..c90bca4a718785ed073b54633d00510949da6030 --- /dev/null +++ b/figures/src_SBB/mappability_index-2020-05-26.R @@ -0,0 +1,144 @@ +setwd("~/Documents/R_data/nanopore_ONT/") +library(tidyverse) + +map_1000 <- read.delim("mappability_over_1000bp.txt") %>% separate(sample, c("sample", "mapped_reads", "total", "percent_mapped"), sep = " ", remove = FALSE) %>% mutate(sample = str_replace_all(sample, "_counts.txt", "")) +map_1000$mapped_reads = NULL +map_1000$total = NULL + +map_1500 <- read.delim("mappability_over_1500bp.txt") %>% separate(sample, c("sample", "mapped_reads", "total", "percent_mapped"), sep = " ", remove = FALSE) %>% mutate(sample = str_replace_all(sample, "_counts.txt", "")) +map_1500$mapped_reads = NULL +map_1500$total = NULL + +map_2000 <- read.delim("mappability_over_2000bp.txt") %>% separate(sample, c("sample", "mapped_reads", "total", "percent_mapped"), sep = " ", remove = FALSE) %>% mutate(sample = str_replace_all(sample, "_counts.txt", "")) +map_2000$mapped_reads = NULL +map_2000$total = NULL + +map_5000 <- read.delim("mappability_over_5000bp.txt") %>% separate(sample, c("sample", "mapped_reads", "total", "percent_mapped"), sep = " ", remove = FALSE) %>% mutate(sample = str_replace_all(sample, "_counts.txt", "")) +map_5000$mapped_reads = NULL +map_5000$total = NULL + +# merging all the files +merged_mappability <- full_join(x = map_1000, y = map_1500,by = "sample") %>% + full_join( x = ., y = map_2000, by = "sample") %>% + full_join( x = ., y = map_5000, by = "sample") + +columns <- c("samples", "bp1000", "bp1500", "bp2000", "bp5000") +colnames(merged_mappability) <- columns +write.csv(merged_mappability, "merged_mappability_index.csv", quote = FALSE) + +# long format +map_long <- pivot_longer(merged_mappability, cols = bp1000:bp5000, names_to = "cutoff", values_to = "count") %>% mutate(samples = str_replace_all(samples, "_counts.txt", "")) + +samps <- c("flye", "megahit", "hybrid_bwa_sr", "hybrid_bwa_lr", "hybrid_bwa_merged", + "hybrid_mmi_sr", "hybrid_mmi_lr", "hybrid_mmi_merged", "megahit_metaT_sr", "hybrid_metaT_sr") +write.csv(map_long, "merged_mappability_long_format.csv", quote = FALSE) + +### Plotting ### +# setting a common theme for plotting +theme_set(theme_bw() + + theme(strip.background = element_rect(colour = "black", fill = "white"), strip.text = element_text(size=17, family="Helvetica")) + + theme(axis.text.x = element_text(angle = 90, size = 18, family = "Helvetica"), + axis.text.y = element_text(size = 18, family = "Helvetica"), + axis.title.y = element_text(size = 18, face = "bold", family = "Helvetica"), + axis.title.x=element_text(size = 18, face = "bold", family = "Helvetica"), + axis.ticks.x=element_blank()) + + theme(plot.title = element_text(hjust = 0.5)) + + theme(legend.text = element_text(size = 18, family = "Helvetica"), + legend.title = element_text(size = 18, family = "Helvetica", face = "bold"), legend.direction = "vertical", legend.box = "vertical")) + +# Bubble plot +map_long %>% + mutate(samples = factor(samples, levels = samps)) %>% + ggplot(aes(x=samples, y=cutoff, size = as.numeric(count), color = cutoff)) + + geom_point(alpha=0.7) + + scale_size(range = c(.1, 15), name="percent_mapped") + + facet_grid(.~factor(samples, levels = c("flye", "megahit", "hybrid_bwa_sr", "hybrid_bwa_lr", "hybrid_bwa_merged", "hybrid_mmi_sr", "hybrid_mmi_lr", "hybrid_mmi_merged", "megahit_metaT_sr", "hybrid_metaT_sr")), scales = "free_x", space = "free_x") + + guides(color = FALSE) + +# without facet +map_long %>% + mutate(samples = factor(samples, levels = c("flye", "megahit", "hybrid_bwa_sr", "hybrid_bwa_lr", "hybrid_bwa_merged", "hybrid_mmi_sr", "hybrid_mmi_lr", "hybrid_mmi_merged", "megahit_metaT_sr", "hybrid_metaT_sr"))) %>% + ggplot(aes(x=samples, y=cutoff, size = as.numeric(count), color = cutoff)) + + geom_point(alpha=0.7) + + scale_size(range = c(.1, 15), name="percent_mapped") + + guides(color = FALSE) + +# heatmap +ggplot(data = map_long, mapping = aes(x = samples, + y = cutoff, + fill = as.numeric(count))) + + geom_tile() + + xlab(label = "Sample") + + scale_fill_gradient(name = "percent_mapped", + low = "#FFFFFF", + high = "#012345") + +# Color Brewer palette +library(viridis) +ggplot(map_long, aes(x = samples, + y = cutoff, + fill = as.numeric(count))) + + geom_tile() + + scale_fill_viridis(discrete=FALSE) + + theme_ipsum() + +############### +# Tables in R # +############### +# Trying tables, from here: https://www.littlemissdata.com/blog/prettytables +library(data.table) +library(dplyr) +# install.packages("formattable") +library(formattable) +library(tidyr) + +#Set a few color variables to make our table more visually appealing +customGreen0 = "#DeF7E9" +customGreen = "#71CA97" +customRed = "#ff7f7f" + +# working with our "real data" +i1 <- merged_mappability %>% arrange(factor(samples, levels = c("flye", "megahit", "hybrid_bwa_sr", "hybrid_bwa_lr", "hybrid_bwa_merged", "hybrid_mmi_sr", "hybrid_mmi_lr", "hybrid_mmi_merged", "megahit_metaT_sr", "hybrid_metaT_sr"))) + +formattable(i1) +#1) First Data Table +formattable(i1, + align =c("l","c","c","c","c"), + list(`samples` = formatter( + "span", style = ~ style(color = "darkblue",font.weight = "bold")) + )) + +#2) Add the color mapping for all 2011 to 2016. +formattable(i1, + align =c("l","c","c","c","c"), + list(`samples` = formatter( + "span", style = ~ style(color = "darkblue",font.weight = "bold")), + `bp1000`= color_tile(customGreen, customGreen0), + `bp1500`= color_tile(customGreen, customGreen0), + `bp2000`= color_tile(customGreen, customGreen0), + `bp5000`= color_tile(customGreen, customGreen0) +)) + +#4) Add custom formatter to improvement over time +percent_formatter <- + formatter("span", + style = x ~ style( + font.weight = "bold", + color = ifelse(x >= 90, customGreen, ifelse(x < 5, customRed, ifelse(x <= 79,"grey", "blue"))))) + +formattable(i1, + align =c("l","c","c","c","c"), + list(`samples` = formatter( + "span", style = ~ style(color = "darkblue",font.weight = "bold")), + `bp1000`= percent_formatter, + `bp1500`= percent_formatter, + `bp2000`= percent_formatter, + `bp5000`= percent_formatter + )) + +##### +# Reactable format table +# https://glin.github.io/reactable/index.html +#install.packages("reactable") +library(reactable) +reactable(merged_mappability) diff --git a/figures/src_SBB/mmseq_upset_plot_w_metaspades.R b/figures/src_SBB/mmseq_upset_plot_w_metaspades.R new file mode 100644 index 0000000000000000000000000000000000000000..cbdbe2c9871bd122dc73bc8551e213d7edad985b --- /dev/null +++ b/figures/src_SBB/mmseq_upset_plot_w_metaspades.R @@ -0,0 +1,130 @@ +library(cowplot) +library(tidyverse) +setwd("~/Documents/R_data/nanopore_ONT/") +overlap_sizes <- read.table("~/Documents/R_data/nanopore_ONT/overlap_sizes.txt") +Total <- read.table("~/Documents/R_data/nanopore_ONT/total.txt") + +sets <- c("flye", "megahit", "metaspades", "metaspades_hybrid") +category <- c( + "flye", + "megahit", + "metaspades", + "metaspades_hybrid", + "flye_megahit", + "flye_metaspades", + "flye_metaspades_hybrid", + "megahit_metaspades", + "megahit_metaspades_hybrid", + "metaspades_metaspades_hybrid", + "flye_megahit_metaspades_metaspades_hybrid" +) + +Overlap <- expand.grid( #generate all the combinations + "sets" = sets, + "category" = category +) %>% + as.data.frame() %>% #determine the intersection: a character col of Y or N. + mutate(intersect = case_when( + str_detect(category, sets %>% as.character()) ~ "Y", + T ~ "N" + )) + +Overlap[15,3] = "N" +Overlap[27,3] = "N" +Overlap[35,3] = "N" + +# Upper-left plot +upperleft <- Total %>% + ggplot(aes(x = V1, y= V2)) + + geom_hline(yintercept = -Inf, size = 1.5) + + geom_vline(xintercept = -Inf, size = 1.5) + + geom_bar(stat = "identity", aes(fill = V1), alpha = 0.8) + + geom_text(aes(label = as.character(V2)), size = 5, angle = 90, hjust = 0, y = 1, fontface = "bold") + + scale_fill_manual(values = c("orangered3", "seagreen", "grey", "dodgerblue2"), #this is my own custom palette + limits = c("flye", "megahit", "metaspades", "metaspades_hybrid")) + #feel free to use something else + scale_x_discrete(labels = NULL) + + scale_y_continuous(labels = NULL) + + labs(x = NULL, + y = "set size") + + theme_minimal() + + theme(legend.position = "none") + + theme(text = element_text(size= 16, face="bold")) + + theme(axis.text.x=element_text(colour = "black", angle = 45, hjust = 1)) + + theme(axis.text.y=element_text(colour = "black")) + + theme(panel.grid = element_blank()) +upperleft + +# Lower-left plot +Overlap$category <- factor(Overlap$category) +lowerleft <- Overlap %>% + mutate(category = factor(rev(Overlap$category))) %>% #in the colored matrix the first y value appears in the bottom, so the order need to be reversed + ggplot(aes(x = sets, y = category))+ + geom_tile(aes(fill = sets, alpha = intersect), color = "black", size = 1.5) + + scale_fill_manual(values = c("orangered1", "seagreen", "grey", "dodgerblue2"), + limits = c("flye", "megahit", "metaspades", "metaspades_hybrid")) + + scale_alpha_manual(values = c(0.8, 0), #color the grid for Y, don't color for N. + limits = c("Y", "N")) + + scale_y_discrete(labels = NULL) + + scale_x_discrete(labels = rep(" ", length(Overlap$sets))) + #I left white space here for better alignment w/ extended plots + labs(x = " ", #white space for better alignment w/ right side plots + y = "overlap") + + theme_minimal() + + theme(legend.position = "none") + + theme(text = element_text(size= 16, face="bold")) + + theme(axis.text.x=element_text(colour = "black")) + + theme(axis.text.y=element_text(colour = "black")) + + theme(panel.grid = element_blank()) +lowerleft + +# Upper-right plot +Assembly_method <- c("flye", "megahit", "metaspades", "metaspades_hybrid") +upperright <- get_legend( + Total %>% + ggplot(aes(x = V1, y= V2)) + + geom_hline(yintercept = -Inf, size = 1.5) + + geom_vline(xintercept = -Inf, size = 1.5) + + geom_bar(stat = "identity", aes(fill = Assembly_method), alpha = 0.9) + + geom_text(aes(label = "Assembly method"), size = 40, angle = 90, hjust = 0, y = 1, fontface = "bold") + + scale_fill_manual(values = c("orangered1", "seagreen", "grey", "dodgerblue2"), #this is my own custom palette + limits = c("flye", "megahit", "metaspades", "metaspades_hybrid")) + #feel free to use something else + scale_x_discrete(labels = NULL) + + scale_y_continuous(labels = NULL) + + labs(x = NULL, + y = "set size") + + theme_minimal() + + theme(legend.position = "right") + + theme(text = element_text(size= 16, face="bold")) + + theme(axis.text.x=element_text(colour = "black", angle = 45, hjust = 1)) + + theme(axis.text.y=element_text(colour = "black")) + + theme(panel.grid = element_blank()) +) +plot_grid(upperright) + +# Lower-right plot +overlap_sizes <- overlap_sizes %>% mutate(V2 = factor(V2, levels = rev(V2))) #the order needs to be reversed for the figure to match +lowerright <- overlap_sizes %>% + ggplot(aes(x = V2, y = V1)) + + geom_hline(yintercept = -Inf, size = 1.5) + + geom_vline(xintercept = -Inf, size = 1.5) + + geom_bar(stat = "identity", fill = "grey80", color = NA, alpha = 0.8) + + geom_text(aes(label = V1, y = 0), size = 5, hjust = 0, vjust = 0.5, fontface = "bold") + + scale_y_continuous(breaks = c(0, max(overlap_sizes$V1)) , + labels = rep(" ", 2)) + #I left white space here for better alignment w/ extended plots + scale_x_discrete(labels = NULL) + + labs(y = "intersect sizes", + x = NULL) + + theme_minimal() + + theme(text = element_text(size= 15, face="bold")) + + theme(axis.text.x=element_text(colour = "black", angle = 45, hjust = 1)) + + theme(axis.text.y=element_text(colour = "black")) + + theme(panel.grid = element_blank()) + + coord_flip() +lowerright + +pdf(file="ONT_protein_assembler_mmseq2_Upset_plot_w_metaspades.pdf") +plot_grid(upperleft, upperright, lowerleft, lowerright, + nrow = 2, + ncol = 2, + rel_heights = c(0.95, 1.5), #the more rows in the lower part, the longer it should be + rel_widths = c(0.95, 0.95)) +dev.off()