diff --git a/workflow/envs/sortmerna.yaml b/workflow/envs/sortmerna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e6a5d7e51ee0ee54bd21f951bc0b37a8e95a525 --- /dev/null +++ b/workflow/envs/sortmerna.yaml @@ -0,0 +1,5 @@ +channels: + - bioconda + - defaults +dependencies: + - sortmerna=4.2.0=0 diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 6ead1b178ad8d307a4e4ea9eaee79d92f66fc6d4..7a8c52559ee7bdbaa1015fae75e99724f4e1b767 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -115,16 +115,16 @@ rule mapping_bwa_idx_assembly: "(date && bwa index {input} -p {params.idx_prefix} && date) &> {log}" # Short reads -rule mapping_bwa_mem_assembly_sr: +rule mapping_bwa_mem_assembly_sr_metag: input: - r1=os.path.join(RESULTS_DIR, "preproc/{mtype}/sr/R1.fastp.fastq.gz"), - r2=os.path.join(RESULTS_DIR, "preproc/{mtype}/sr/R2.fastp.fastq.gz"), + r1=os.path.join(RESULTS_DIR, "preproc/metag/sr/R1.fastp.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/metag/sr/R2.fastp.fastq.gz"), asm=os.path.join(RESULTS_DIR, "assembly/{rtype}/{tool}/{atype}.fasta"), idx=expand(os.path.join(RESULTS_DIR, "assembly/{{rtype}}/{{tool}}/{{atype}}.{ext}"), ext=BWA_IDX_EXT) output: - os.path.join(RESULTS_DIR, "mapping/{mtype}/{rtype}/{tool}/{atype}.sr.bam") + os.path.join(RESULTS_DIR, "mapping/metag/{rtype}/{tool}/{atype}.sr.bam") log: - "logs/bwa.mem.{mtype}.{rtype}.{tool}.{atype}.sr.log", + "logs/bwa.mem.metag.{rtype}.{tool}.{atype}.sr.log", wildcard_constraints: mtype="|".join(META_TYPES), rtype="|".join(READ_TYPES), @@ -146,6 +146,36 @@ rule mapping_bwa_mem_assembly_sr: "samtools sort -@ {threads} -m {config[samtools][sort][chunk_size]} -T {params.bam_prefix} > {output} && " "date) &> {log}" +rule mapping_bwa_mem_assembly_sr_metat: + input: + r1=os.path.join(RESULTS_DIR, "preproc/metat/sr/R1.sortmerna.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/metat/sr/R2.sortmerna.fastq.gz"), + asm=os.path.join(RESULTS_DIR, "assembly/{rtype}/{tool}/{atype}.fasta"), + idx=expand(os.path.join(RESULTS_DIR, "assembly/{{rtype}}/{{tool}}/{{atype}}.{ext}"), ext=BWA_IDX_EXT) + output: + os.path.join(RESULTS_DIR, "mapping/metat/{rtype}/{tool}/{atype}.sr.bam") + log: + "logs/bwa.mem.metat.{rtype}.{tool}.{atype}.sr.log", + wildcard_constraints: + rtype="|".join(READ_TYPES), + tool="|".join(ASSEMBLERS), + atype="ASSEMBLY|ASSEMBLY.FILTERED" + threads: + config["bwa"]["threads"] + params: + idx_prefix=lambda wildcards, input: os.path.splitext(input.idx[0])[0], + bam_prefix=lambda wildcards, output: os.path.splitext(output[0])[0] + conda: + os.path.join(ENV_DIR, "bwa.yaml") + message: + "Mapping short reads to assembly w/ BWA" + shell: + "(date && " + "bwa mem -t {threads} {params.idx_prefix} {input.r1} {input.r2} | " + "samtools view -@ {threads} -SbT {input.asm} | " + "samtools sort -@ {threads} -m {config[samtools][sort][chunk_size]} -T {params.bam_prefix} > {output} && " + "date) &> {log}" + # Long reads rule mapping_bwa_mem_assembly_lr: input: @@ -196,6 +226,29 @@ rule mapping_bwa_mem_assembly_hy: shell: "(date && samtools merge {output} {input.sr} {input.lr} && date) &> {log}" +# Hybrid: "short + long reads" hybrid, "metaT and metaG" hybrid +# NOTE: such assemblies are not implemented in the pipeline +# but can be included in the analysis by linking to the contigs FASTA +rule mapping_bwa_mem_assembly_hy_hy: + input: + srg=os.path.join(RESULTS_DIR, "mapping/metag/metag/{tool}/{atype}.sr.bam"), + srt=os.path.join(RESULTS_DIR, "mapping/metag/metat/{tool}/{atype}.sr.bam"), + lr=os.path.join(RESULTS_DIR, "mapping/metag/{rtype}/{tool}/{atype}.lr.bam") + output: + os.path.join(RESULTS_DIR, "mapping/metag/{rtype}/{tool}/{atype}.hy.bam") + log: + "logs/bwa_mem.metag.hy.{rtype}.{tool}.{atype}.hy.log" + wildcard_constraints: + tool="|".join(config["assemblers"]["hyhy"]), + atype="ASSEMBLY|ASSEMBLY.FILTERED" + threads: 1 + conda: + os.path.join(ENV_DIR, "bwa.yaml") + message: + "Mapping: merging short-reads and long-reads mapping results" + shell: + "(date && samtools merge {output} {input} && date) &> {log}" + ################################################## # Assembly coverage diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk index 54a7e4832a603dc631748a159f32bcc3123b0cc6..3a729d5c733bf6ccaa3219f2b1879a6539e83e3c 100644 --- a/workflow/rules/preprocessing.smk +++ b/workflow/rules/preprocessing.smk @@ -69,3 +69,36 @@ rule nanostat_guppy_basecalling: "Preprocessing long reads: NanoStats" shell: "(date && NanoStat --fastq {input} --outdir $(dirname {output}) -n $(basename {output}) && date) 2> {log.err} > {log.out}" + +################################################## +# rRNA filtering +rule sortmerna_filt: + input: + r1=os.path.join(RESULTS_DIR, "preproc/metat/sr/R1.fastp.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/metat/sr/R2.fastp.fastq.gz") + output: + r1=os.path.join(RESULTS_DIR, "preproc/metat/sr/R1.sortmerna.fastq.gz"), + r2=os.path.join(RESULTS_DIR, "preproc/metat/sr/R2.sortmerna.fastq.gz"), + r1_o=temp(os.path.join(RESULTS_DIR, "preproc/metat/sr/sortmerna/out/other_fwd.fq")), + r2_o=temp(os.path.join(RESULTS_DIR, "preproc/metat/sr/sortmerna/out/other_rev.fq")), + r1_a=temp(os.path.join(RESULTS_DIR, "preproc/metat/sr/sortmerna/out/aligned_fwd.fq")), + r2_a=temp(os.path.join(RESULTS_DIR, "preproc/metat/sr/sortmerna/out/aligned_rev.fq")) + log: + out="logs/sortmerna.metat.sr.out.log" + threads: + config["sortmerna"]["threads"] + params: + refs=" ".join([ "-ref %s" % ref for ref in config["sortmerna"]["refs"]]) + conda: + os.path.join(ENV_DIR, "sortmerna.yaml") + message: + "Preprocessing metaT short reads: sortmerna" + shell: + "(date && " + "sortmerna {params.refs} -reads {input.r1} -reads {input.r2} -workdir $(dirname {output.r1})/sortmerna " + "--fastx --other -paired_in -v --task 4 --num_alignments 1 --out2 --threads {threads} && " + "gzip -c {output.r1_o} > {output.r1} && " + "gzip -c {output.r2_o} > {output.r2} && " + "gzip -c {output.r1_a} > {output.r1_a}.gz && " + "gzip -c {output.r2_a} > {output.r2_a}.gz && " + "date) &> {log}" diff --git a/workflow/steps/preprocessing.smk b/workflow/steps/preprocessing.smk index 3c01617162a43a89e33024e4bf547e7fcde6e92b..7e6e29ffd89dadfa28c5f6eff6ef16b9dcda691b 100644 --- a/workflow/steps/preprocessing.smk +++ b/workflow/steps/preprocessing.smk @@ -17,6 +17,7 @@ rule PREPROCESSING_LR: rule PREPROCESSING_SR: input: fastp=expand(os.path.join(RESULTS_DIR, "preproc/{mtype}/sr/{rid}.fastp.fastq.gz"), mtype=META_TYPES, rid=["R1", "R2"]), - fastqc=expand(os.path.join(RESULTS_DIR, "qc/{mtype}/sr/{rid}.fastp_{ext}"), mtype=META_TYPES, rid=["R1", "R2"], ext=["fastqc.html", "fastqc.zip"]) + fastqc=expand(os.path.join(RESULTS_DIR, "qc/{mtype}/sr/{rid}.fastp_{ext}"), mtype=META_TYPES, rid=["R1", "R2"], ext=["fastqc.html", "fastqc.zip"]), + sortmerna=expand(os.path.join(RESULTS_DIR, "preproc/metat/sr/{rid}.sortmerna.fastq.gz"), rid=["R1", "R2"]) output: touch("status/preprocessing_sr.done")