#!/bin/bash -l
#
# cdhit_comparisons.sh
#
# Compares the predicted proteins of the short-read metaSPAdes assembly with
# those of the hybrid (long + short read) metaSPAdes assembly:
#   1. counts the protein headers carrying a "partial" tag of 10, 01 or 11,
#   2. prefixes the FASTA headers so each sequence's origin stays visible
#      after clustering,
#   3. runs cd-hit-2d in both directions (-c 0.9),
#   4. prints the number of sequences cd-hit-2d kept for each db2 and runs
#      plot_len1.pl on every .clstr file.
#
# Requires the conda environments "bbmap" (rename.sh) and "cd-hit"
# (cd-hit-2d, plot_len1.pl).
#
# NOTE(review): the login shell (-l) is presumably what makes `conda activate`
# usable in this non-interactive script -- confirm on the target cluster.

# -u is deliberately left off: conda's activate/deactivate hooks may reference
# unset variables and would abort the script.
set -eo pipefail

readonly PROJECT_DIR=/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB
readonly PROTEIN_DIR="${PROJECT_DIR}/results/annotation/proteins"

# Print a message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# grep wrapper: "no match" (exit status 1) is a valid result here, whereas a
# real grep error (status > 1) must still fail the script.
# Arguments: passed straight through to grep.
grep_allow_empty() {
  local status=0
  grep "$@" || status=$?
  ((status <= 1))
}

# Count the occurrences of "partial" whose next (up to) three characters
# contain 10, 01 or 11.
# NOTE(review): presumably these are Prodigal-style "partial=NN" header tags,
# i.e. genes truncated at a contig edge -- confirm against the annotation step.
# Arguments: $1 - protein FASTA file
# Outputs:   the count on stdout
count_partial_genes() {
  local faa=$1
  [[ -r "$faa" ]] || die "cannot read ${faa}"
  grep_allow_empty -E -o 'partial.{0,3}' -- "$faa" \
    | grep_allow_empty -c -E '10|01|11'
}

# Run cd-hit-2d with the settings shared by both comparison directions.
# Arguments: $1 - db1 FASTA, $2 - db2 FASTA, $3 - output prefix
run_cdhit_2d() {
  local db1=$1 db2=$2 out=$3
  cd-hit-2d -i "$db1" -i2 "$db2" -o "$out" -c 0.9 -n 5 -d 0 -M 16000 -T 8
}

main() {
  local count_file clstr
  local -a clstr_files

  cd "$PROJECT_DIR" || die "cannot cd to ${PROJECT_DIR}"

  # -p / -sfn keep the script re-runnable: an existing directory or link is
  # reused/replaced instead of raising an error.
  mkdir -p cd-hit
  cd cd-hit || die "cannot cd to ${PROJECT_DIR}/cd-hit"
  ln -sfn "${PROTEIN_DIR}/metaspades/ONT3_MG_xx_Rashi_S11/final.contigs.faa" \
    metaspades.faa
  ln -sfn "${PROTEIN_DIR}/metaspades_hybrid/lr_no_barcode-sr_ONT3_MG_xx_Rashi_S11/contigs.faa" \
    metaspades_hybrid.faa

  count_partial_genes metaspades.faa >metaspades_partial_counts.txt
  count_partial_genes metaspades_hybrid.faa >metaspades_hybrid_partial_counts.txt

  # Getting the counts in one file: "<file name><TAB><count>" per assembly.
  # The two inputs are listed explicitly (in the order a *.txt glob yields on
  # a first run) so a re-run does not pick up partial_gene_counts.txt, and
  # "counts" is truncated rather than appended to.
  for count_file in \
    metaspades_hybrid_partial_counts.txt \
    metaspades_partial_counts.txt; do
    printf '%s\n' "$count_file"
    cat -- "$count_file"
  done >counts
  paste - - <counts >partial_gene_counts.txt

  # Editing FASTA headers to make them easier to tell apart after cd-hit
  # clustering.
  conda activate bbmap
  rename.sh in=metaspades.faa out=spades.faa prefix=spades ignorejunk=t
  rename.sh in=metaspades_hybrid.faa out=hybrid.faa prefix=hybrid ignorejunk=t

  # Original note: `si` -- interactive mode on IRIS (run before clustering).
  conda activate cd-hit
  run_cdhit_2d spades.faa hybrid.faa spades_hybrid
  run_cdhit_2d hybrid.faa spades.faa hybrid_spades

  # Determining the number of unique sequences.
  # According to
  # http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d
  # CD-HIT-2D outputs two files: a FASTA file of the proteins in "db2" that
  # are not similar to db1, and a text file listing the similar sequences
  # between db1 and db2.
  grep_allow_empty -c '>' spades_hybrid # db2 == hybrid; recorded result: 63911
  grep_allow_empty -c '>' hybrid_spades # db2 == spades; recorded result: 27526

  # Making plots for all .clstr files
  # (http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d).
  # nullglob: without it an unmatched pattern would be handed to plot_len1.pl
  # as the literal string "*.clstr".
  shopt -s nullglob
  clstr_files=(*.clstr)
  ((${#clstr_files[@]} > 0)) || die "no .clstr files found in ${PWD}"

  # NOTE(review): the cluster-size bins below leave 300-499 uncovered
  # (…,100-299,500-99999). Kept unchanged so results stay comparable with
  # earlier runs -- confirm whether the gap is intended.
  for clstr in "${clstr_files[@]}"; do
    printf '%s\n' "$clstr"
    plot_len1.pl "$clstr" \
      1,2-4,5-9,10-19,20-49,50-99,100-299,500-99999 \
      10-59,60-149,150-499,500-1999,2000-999999
  done >cluster_plots
}

main "$@"