Skip to content
Snippets Groups Projects

WIP: Checkpoint snakefile

Merged Susheel Busi requested to merge checkpoint_snakefile into master
Compare and Show latest version
1 file
+ 41
0
Compare changes
  • Side-by-side
  • Inline
+ 41
0
#!/bin/bash -l
# Compares the proteins of the metaspades and metaspades_hybrid annotations of
# one sample:
#   1. counts the "partial" header flags containing 10, 01 or 11 in each file,
#   2. renames the FASTA headers with a per-assembly prefix (rename.sh),
#   3. runs cd-hit-2d in both directions,
#   4. plots every resulting .clstr file with plot_len1.pl.
#
# All outputs are written to "${base_dir}/cd-hit". The script is safe to
# re-run: the directory and symlinks are reused and the summary files are
# rewritten instead of appended to.
#
# NOTE(review): the login-shell flag (-l) is presumably what makes
# `conda activate` usable in this non-interactive script - confirm.

readonly base_dir=/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB
readonly proteins_dir="${base_dir}/results/annotation/proteins"

# Prints a message to stderr and aborts the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Prints the number of "partial" + up-to-3-character matches in FASTA file $1
# that contain 10, 01 or 11.
# NOTE(review): presumably these are "partial=XX" gene-prediction flags, where
# anything other than 00 marks an incomplete gene - confirm.
count_partial() {
  grep -E -o "partial.{0,3}" "$1" | grep -c '10\|01\|11'
}

# Abort early if the working directory is unavailable; otherwise every later
# command would create its files in whatever directory we happen to be in.
cd "$base_dir" || die "cannot cd to ${base_dir}"
mkdir -p cd-hit || die "cannot create ${base_dir}/cd-hit"
cd cd-hit || die "cannot cd to ${base_dir}/cd-hit"

# -f replaces links left over from a previous run.
ln -sf "${proteins_dir}/metaspades/ONT3_MG_xx_Rashi_S11/final.contigs.faa" metaspades.faa
ln -sf "${proteins_dir}/metaspades_hybrid/lr_no_barcode-sr_ONT3_MG_xx_Rashi_S11/contigs.faa" metaspades_hybrid.faa

count_partial metaspades.faa > metaspades_partial_counts.txt
count_partial metaspades_hybrid.faa > metaspades_hybrid_partial_counts.txt

# getting the counts in one file
# Only the per-assembly count files are globbed so that partial_gene_counts.txt
# from an earlier run is not folded back into the summary, and "counts" is
# truncated so that paste always sees clean name/value pairs.
for file in *_partial_counts.txt; do
  [[ -e "$file" ]] || continue
  printf '%s\n' "$file"
  cat "$file"
done > counts
paste - - < counts > partial_gene_counts.txt

# editing fasta headers to make it easier after cd-hit clustering
conda activate bbmap || die "cannot activate conda environment bbmap"
rename.sh in=metaspades.faa out=spades.faa prefix=spades ignorejunk=t \
  || die "rename.sh failed for metaspades.faa"
rename.sh in=metaspades_hybrid.faa out=hybrid.faa prefix=hybrid ignorejunk=t \
  || die "rename.sh failed for metaspades_hybrid.faa"

# si # interactive mode on IRIS
conda activate cd-hit || die "cannot activate conda environment cd-hit"
cd-hit-2d -i spades.faa -i2 hybrid.faa -o spades_hybrid -c 0.9 -n 5 -d 0 -M 16000 -T 8 \
  || die "cd-hit-2d failed (db1=spades, db2=hybrid)"
cd-hit-2d -i hybrid.faa -i2 spades.faa -o hybrid_spades -c 0.9 -n 5 -d 0 -M 16000 -T 8 \
  || die "cd-hit-2d failed (db1=hybrid, db2=spades)"

# determining number of unique sequences
# according to http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d
# CD-HIT-2D outputs two files: a fasta file of proteins in "db2" that are not
# similar to db1 and a text file that lists similar sequences between db1 & db2
grep -c '>' spades_hybrid # db2 == hybrid
# 63911 (value recorded from the original run)
grep -c '>' hybrid_spades # db2 == spades
# 27526 (value recorded from the original run)

# making plots for all .clstr files (http://weizhongli-lab.org/lab-wiki/doku.php?id=cd-hit-user-guide#cd-hit-2d)
# NOTE(review): the first bin list jumps from 100-299 to 500-99999, so the
# range 300-499 is not covered by any bin. Kept unchanged - confirm whether
# this is intended.
for file in *.clstr; do
  [[ -e "$file" ]] || continue # no .clstr files: skip the literal pattern
  echo "$file"
  plot_len1.pl "$file" \
    1,2-4,5-9,10-19,20-49,50-99,100-299,500-99999 \
    10-59,60-149,150-499,500-1999,2000-999999
done > cluster_plots
\ No newline at end of file
Loading