diff --git a/2019_GDB/scripts/cdhit_comparisons.sh b/2019_GDB/scripts/cdhit_comparisons.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5374f8f037336f0cf5825616697781e3f2827091
--- /dev/null
+++ b/2019_GDB/scripts/cdhit_comparisons.sh
@@ -0,0 +1,41 @@
+#!/bin/bash -l
+cd /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB || exit 1
+# Working directory with short-named links to both protein sets; -p / -f let a re-run reuse what exists, and a failed cd aborts.
+mkdir -p cd-hit
+cd cd-hit || exit 1
+ln -sf /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/annotation/proteins/metaspades/ONT3_MG_xx_Rashi_S11/final.contigs.faa metaspades.faa
+ln -sf /mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/results/annotation/proteins/metaspades_hybrid/lr_no_barcode-sr_ONT3_MG_xx_Rashi_S11/contigs.faa metaspades_hybrid.faa
+
+grep -E -o "partial.{0,3}" metaspades.faa | grep '10\|01\|11' | wc -l > metaspades_partial_counts.txt
+grep -E -o "partial.{0,3}" metaspades_hybrid.faa | grep '10\|01\|11' | wc -l > metaspades_hybrid_partial_counts.txt
+
+# getting the counts in one file
+for file in *.txt; do echo $file; cat $file; done >> counts
+cat counts | paste - - > partial_gene_counts.txt
+
+# editing fasta headers to make it easier after cd-hit clustering
+conda activate bbmap   # presumably the environment that provides rename.sh - confirm
+rename.sh in=metaspades.faa out=spades.faa prefix=spades ignorejunk=t   # non-hybrid set -> spades.faa
+rename.sh in=metaspades_hybrid.faa out=hybrid.faa prefix=hybrid ignorejunk=t   # hybrid set -> hybrid.faa
+
+# si    # interactive mode on IRIS
+conda activate cd-hit   # presumably the environment that provides cd-hit-2d - confirm
+cd-hit-2d -i spades.faa -i2 hybrid.faa -o spades_hybrid -c 0.9 -n 5 -d 0 -M 16000 -T 8   # db1 = spades, db2 = hybrid
+cd-hit-2d -i hybrid.faa -i2 spades.faa -o hybrid_spades -c 0.9 -n 5 -d 0 -M 16000 -T 8   # reverse direction: db1 = hybrid, db2 = spades
+
+# determining number of unique sequences
+# according to http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d
+# CD-HIT-2D outputs two files: a fasta file of proteins in "db2" that are not similar to db1 and a text file that lists similar sequences between db1 & db2
+grep -c '>' spades_hybrid   # db2 == hybrid: counts header lines, i.e. hybrid proteins not similar to any spades protein
+# 63911
+grep -c '>' hybrid_spades   # db2 == spades: counts header lines, i.e. spades proteins not similar to any hybrid protein
+# 27526
+
+# making plots for all .clstr files (http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d); ">" so a re-run replaces cluster_plots instead of appending duplicates
+# NOTE(review): the size bins jump from 100-299 to 500-99999, so 300-499 is covered by no bin - confirm clusters of that size are not silently dropped.
+for file in *.clstr
+do
+    [ -e "$file" ] || continue   # the glob matched nothing: skip the literal pattern
+    echo "$file"
+    plot_len1.pl "$file" 1,2-4,5-9,10-19,20-49,50-99,100-299,500-99999 10-59,60-149,150-499,500-1999,2000-999999
+done > cluster_plots
\ No newline at end of file