diff --git a/rules/Assembly/MGMT.rules b/rules/Assembly/MGMT.rules index a607f8a972c4c24a951241e88a668f671712dba9..e12875a916b2acea433ddc234b6364a4a97b8019 100644 --- a/rules/Assembly/MGMT.rules +++ b/rules/Assembly/MGMT.rules @@ -294,3 +294,90 @@ rule ASSEMBLY_MT_MAPPING: # index samtools index $PREFIX.sorted.bam """ + + +################ +# ## MGMT FAST ASSEMBLY USING MEGAHIT +################ + +rule ASSEMBLY_MG_FAST_1: + log: + A_LOG + benchmark: + "%s/benchmarks/ASSEMBLY_MT_ASSEMBLY_1.json" % A_OUT + input: + preprocessed_mg('R1'), + preprocessed_mg('R2'), + preprocessed_mg('SE'), + '{dir}/MT.assembly_1/final.contigs.fa'.format(dir=A_OUT) + output: + '{dir}/MGMT.assembly_1/final.contigs.fa'.format(dir=A_OUT) + params: + outdir = "{dir}/MGMT.assembly_1".format(dir=A_OUT) + shell: + """ + MAX_READ_LEN=$(cat {input} | sed -n '1~4s/^@/>/p;2~4p' |\ + awk '$0 ~ \">\" {{c=0\"\t\"; }} $0 !~ \">\" {{c+=length($0); max=(max>c)?max:c;}} END {{print max}}') + echo "Max read length: $MAX_READ_LEN" + MEMBYTES=$(({MEMTOTAL}*1000*1000*1000)) + echo "Available memory in bytes: $MEMBYTES" + megahit -o {params.outdir} --cpu-only -m $MEMBYTES --mem-flag 1\ + -l $MAX_READ_LEN --input-cmd "cat {input}" -t {THREADS} --continue > {log} 2>&1 + """ + +rule ASSEMBLY_MGMT_EXTRACT_UNMAPPED_FROM_FAST_1: + log: + A_LOG + benchmark: + "%s/benchmarks/ASSEMBLY_MT_EXTRACT_UNMAPPED_FROM_ASSEMBLY_1.json" % A_OUT + input: + preprocessed_mg('R1'), + preprocessed_mg('R2'), + preprocessed_mg('SE'), + '{dir}/MGMT.assembly_1/final.contigs.fa'.format(dir=A_OUT), + expand('{dir}/MGMT.assembly_1/final.contigs.fa.{ext}', dir=A_OUT, ext=['amb', 'bwt', 'pac', 'sa', 'ann']) + output: + expand('{dir}/{name}', name=[ + 'MG.R1.unmapped.fq', + 'MG.R2.unmapped.fq', + 'MG.SE.unmapped.fq'], dir=A_OUT) + shell: + """ + TMP_FILE=$(mktemp --tmpdir={TMPDIR} -t "alignment_XXXXXX.bam") + BUFFER=$(mktemp --tmpdir={TMPDIR} -t "alignment_buffer_XXXXXX.bam") + bwa mem -v 1 -t {THREADS} {input[3]} {input[0]} {input[1]} | samtools view -@ {THREADS} -bS - > $TMP_FILE + samtools merge -@ {THREADS} -u - \ + <(samtools view -@ {THREADS} -u -f 4 -F 264 $TMP_FILE) \ + <(samtools view -@ {THREADS} -u -f 8 -F 260 $TMP_FILE) \ + <(samtools view -@ {THREADS} -u -f 12 -F 256 $TMP_FILE) | \ + samtools view -@ {THREADS} -bF 0x800 - | samtools sort -o -@ {THREADS} -m {MEMCORE}G -n - $BUFFER | \ + bamToFastq -i stdin -fq {output[0]} -fq2 {output[1]} + bwa mem -v 1 -t {THREADS} {input[3]} {input[2]} | \ + samtools view -@ {THREADS} -bS - | samtools view -@ {THREADS} -uf 4 - | \ + bamToFastq -i stdin -fq {output[2]} + rm -rf $BUFFER* $TMP_FILE + """ + +rule ASSEMBLY_MGMT_FAST_2: + log: + A_LOG + benchmark: + "%s/benchmarks/ASSEMBLY_MT_ASSEMBLY_2.json" % A_OUT + input: + expand('{dir}/{name}', name=[ + 'MG.R1.unmapped.fq', + 'MG.R2.unmapped.fq', + 'MG.SE.unmapped.fq'], dir=A_OUT) + output: + '{dir}/MGMT.assembly_2/final.contigs.fa'.format(dir=A_OUT) + params: + outdir = "{dir}/MGMT.assembly_2".format(dir=A_OUT) + shell: + """ + MAX_READ_LEN=$(cat {input} | sed -n '1~4s/^@/>/p;2~4p' | \ + awk '$0 ~ \">\" {{c=0\"\t\"; }} $0 !~ \">\" {{c+=length($0); max=(max>c)?max:c;}} END {{print max}}') + MEMBYTES=$(({MEMTOTAL}*1000*1000*1000)) + megahit -o {params.outdir} --cpu-only -m $MEMBYTES --mem-flag 1 -l $MAX_READ_LEN \ + --input-cmd "cat {input}" -t {THREADS} --continue > {log} 2>&1 + """ + diff --git a/rules/Assembly/master.rules b/rules/Assembly/master.rules index 6e04446f9ea5440885033f9acdcbbe7c314a23bf..69a28ffb2b566a2fa1cfcb8ea33d985ed3487aa1 100644 --- a/rules/Assembly/master.rules +++ b/rules/Assembly/master.rules @@ -34,3 +34,9 @@ rule ASSEMBLY_MG_ALL: shell: "echo 'MG Assembly Done'" +#rule ASSEMBLY_FAST: +# input: +# "%s/MGMT.assembly.merged.fa" % A_OUT +# shell: +# "echo 'MGMT fast assembly Done'" +#