Skip to content
Snippets Groups Projects
Commit 54bfb1b1 authored by Alexis Mergez's avatar Alexis Mergez
Browse files

Update Snakefile

parent 8b1ca53c
No related branches found
No related tags found
1 merge request!12v1.7
Pipeline #184152 failed
configfile: "config.yaml"
## Modules
import os
import numpy as np
import gzip
import re
# Getting the list of haplotypes (SAMPLES)
"""
Main variables used in the workflow
"""
# Retrieving the list of haplotypes (SAMPLES) in data/haplotypes
SAMPLES = np.unique([
os.path.basename(f).split('.fa')[0]
for f in os.listdir("data/haplotypes/")
if re.search(r".fa", f)
])
# Retrieving the list of haplotypes excluding the reference
SAMPLES_NOREF = [
sample
for sample in SAMPLES
if sample != os.path.basename(config['reference']).split('.fa')[0]
]
# Storing the number of haplotypes
nHAP = len(SAMPLES)
# Getting the list of chromosomes
# Retrieving the list of chromosomes from the fasta index of the reference
with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]
# Configuring steps to be done
# Adjusting the requested outputs according to the config file
def which_analysis():
# Creating a list with default analysis steps (to prevent the function from returning an empty list)
analysis_inputs = [
......@@ -30,25 +40,36 @@ def which_analysis():
expand("output/panacus.reports/{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth
expand("output/chrGraphs.figs/{chromosome}.1Dviz.png", chromosome=CHRLIST), # visualizations from odgi on chromosome graphs
"output/pan1c.pggb."+config['name']+".chrGraph.general.stats.tsv" # chromosomes graph statistics
]
]
# Optionals analysis steps
if config["get_PAV"] == "True":
if config["get_PAV"] == "True": # Adding PAV matrix creation
analysis_inputs.append("output/pav.matrices")
if config["get_allASM_SyRI"] == "True":
if config["get_allASM_SyRI"] == "True": # Creating the figure for all assemblies
analysis_inputs.append("output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png")
if config["get_ASMs_SyRI"] == "True":
if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each figures
analysis_inputs.append(
expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF), # syri for haplotypes
)
return analysis_inputs
"""
Rules
"""
# Main target rule
rule all:
input:
"output/pan1c.pggb."+config['name']+".gfa", # Final graph (main output)
"output/pan1c.pggb."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
which_analysis()
"""
Tool rules
"""
rule samtools_index:
# Samtools faidx to index compressed fasta
input:
......@@ -64,10 +85,10 @@ rule samtools_index:
"faidx {input.fa}"
"""
Pre-processing section
Preparing pggb inputs
"""
Pre-processing section : preparing pggb inputs
"""
rule ragtag_scaffolding:
# Scaffold input haplotype against the reference to infer chromosome scale sequences
input:
......@@ -91,6 +112,25 @@ rule ragtag_scaffolding:
-o {output}
"""
rule clustering:
# Read ragtagged fastas and split chromosome sequences into according bins
input:
expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
output:
expand('data/chrInputs/{chromosome}.fa.gz', chromosome=CHRLIST)
threads: workflow.cores
params:
apppath=config["app.path"]
shell:
"""
mkdir -p $(dirname {output[0]})
apptainer run {params.apppath}/pan1c-env.sif python scripts/inputClustering.py \
--fasta {input} --output $(dirname {output[0]})
for file in $(dirname {output[0]})/*.fa; do
apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file
done
"""
rule create_allAsm_syri_fig:
# Create a SyRI figure containing all input assemblies
input:
......@@ -140,28 +180,10 @@ rule create_sglAsm_syri_fig:
"""
"""
Core section
"""
rule clustering:
# Read ragtagged fastas and split chromosome sequences into according bins
input:
expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
output:
expand('data/chrInputs/{chromosome}.fa.gz', chromosome=CHRLIST)
threads: workflow.cores
params:
apppath=config["app.path"]
shell:
"""
mkdir -p $(dirname {output[0]})
apptainer run {params.apppath}/pan1c-env.sif python scripts/inputClustering.py \
--fasta {input} --output $(dirname {output[0]})
for file in $(dirname {output[0]})/*.fa; do
apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file
done
"""
Core section : Running PGGB
"""
rule create_pggb_input_syri_fig:
input:
fasta='data/chrInputs/{chromosome}.fa.gz'
......@@ -372,12 +394,13 @@ rule core_statistics:
"""
"""
Post-processing section :
The graph for each chromosome are made as well as some basic statistics.
In this section, more stats are produced but more specifics ones requiring dedicated tools (Panacus, PAVs for Panache ...).
It also contains rules to use the graph itself.
"""
"""
rule get_pav:
# Create PAV matrix readable by panache for a given chromosome scale graph
input:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment