Update Snakefile

54bfb1b1 · Alexis Mergez · 8b1ca53c · 54bfb1b1
Commit 54bfb1b1 authored 11 months ago by Alexis Mergez
--- a/Snakefile
+++ b/Snakefile
 configfile: "config.yaml"

+## Modules
 import os
 import numpy as np
 import gzip
 import re

-# Getting the list of haplotypes (SAMPLES)
+"""
+
+Main variables used in the workflow
+
+"""
+# Retrieving the list of haplotypes (SAMPLES) in data/haplotypes
 SAMPLES = np.unique([
    os.path.basename(f).split('.fa')[0] 
    for f in os.listdir("data/haplotypes/")
    if re.search(r".fa", f)
    ])
+
+# Retrieving the list of haplotypes excluding the reference
 SAMPLES_NOREF = [
    sample 
    for sample in SAMPLES 
    if sample != os.path.basename(config['reference']).split('.fa')[0]
    ]
+
+# Storing the number of haplotypes
 nHAP = len(SAMPLES)

-# Getting the list of chromosomes
+# Retrieving the list of chromosomes from the fasta index of the reference
 with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
    CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]

-# Configuring steps to be done
+# Adjusting the requested outputs according to the config file
 def which_analysis():
    # Creating a list with default analysis steps (to prevent the function from returning an empty list)
    analysis_inputs = [     
@@ -30,25 +40,36 @@ def which_analysis():
        expand("output/panacus.reports/{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth 
        expand("output/chrGraphs.figs/{chromosome}.1Dviz.png", chromosome=CHRLIST), # visualizations from odgi on chromosome graphs
        "output/pan1c.pggb."+config['name']+".chrGraph.general.stats.tsv" # chromosomes graph statistics
-        ]
+    ]
    
    # Optionals analysis steps
-    if config["get_PAV"] == "True":
+    if config["get_PAV"] == "True": # Adding PAV matrix creation
        analysis_inputs.append("output/pav.matrices")
-    if config["get_allASM_SyRI"] == "True":
+    if config["get_allASM_SyRI"] == "True": # Creating the figure for all assemblies
        analysis_inputs.append("output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png")
-    if config["get_ASMs_SyRI"] == "True":
+    if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each figures 
        analysis_inputs.append(
            expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF), # syri for haplotypes
        )
    return analysis_inputs

+"""
+
+Rules
+
+"""
+# Main target rule
 rule all:
    input:
        "output/pan1c.pggb."+config['name']+".gfa", # Final graph (main output)
        "output/pan1c.pggb."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
        which_analysis()

+"""
+
+Tool rules
+
+"""
 rule samtools_index:
    # Samtools faidx to index compressed fasta
    input:
@@ -64,10 +85,10 @@ rule samtools_index:
        "faidx {input.fa}"

 """
-Pre-processing section
-Preparing pggb inputs
-"""

+Pre-processing section : preparing pggb inputs
+
+"""
 rule ragtag_scaffolding:
    # Scaffold input haplotype against the reference to infer chromosome scale sequences
    input:
@@ -91,6 +112,25 @@ rule ragtag_scaffolding:
            -o {output}
        """

+rule clustering:
+    # Read ragtagged fastas and split chromosome sequences into according bins
+    input:
+        expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
+    output:
+        expand('data/chrInputs/{chromosome}.fa.gz', chromosome=CHRLIST)
+    threads: workflow.cores
+    params:
+        apppath=config["app.path"]
+    shell:
+        """
+        mkdir -p $(dirname {output[0]})
+        apptainer run {params.apppath}/pan1c-env.sif python scripts/inputClustering.py \
+            --fasta {input} --output $(dirname {output[0]})
+        for file in $(dirname {output[0]})/*.fa; do
+            apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file
+        done
+        """
+
 rule create_allAsm_syri_fig:
    # Create a SyRI figure containing all input assemblies
    input:
@@ -140,28 +180,10 @@ rule create_sglAsm_syri_fig:
        """

 """
-Core section
-"""

-rule clustering:
-    # Read ragtagged fastas and split chromosome sequences into according bins
-    input:
-        expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
-    output:
-        expand('data/chrInputs/{chromosome}.fa.gz', chromosome=CHRLIST)
-    threads: workflow.cores
-    params:
-        apppath=config["app.path"]
-    shell:
-        """
-        mkdir -p $(dirname {output[0]})
-        apptainer run {params.apppath}/pan1c-env.sif python scripts/inputClustering.py \
-            --fasta {input} --output $(dirname {output[0]})
-        for file in $(dirname {output[0]})/*.fa; do
-            apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file
-        done
-        """
+Core section : Running PGGB

+"""
 rule create_pggb_input_syri_fig:
    input:
        fasta='data/chrInputs/{chromosome}.fa.gz'
@@ -372,12 +394,13 @@ rule core_statistics:
        """

 """
+
 Post-processing section :
    The graph for each chromosome are made as well as some basic statistics.
    In this section, more stats are produced but more specifics ones requiring dedicated tools (Panacus, PAVs for Panache ...).
    It also contains rules to use the graph itself. 
-"""

+"""
 rule get_pav:
    # Create PAV matrix readable by panache for a given chromosome scale graph
    input: