diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml index 827b73d952ae28570bf87aebd6585e1645097ffe..6f0f25a8344588c3800cf3778d2d1232a81e8df4 100644 --- a/.config/masterconfig.yaml +++ b/.config/masterconfig.yaml @@ -1,5 +1,5 @@ # absolute path to your desired output path -root: /output/path +root: . ####################### optional prejob - data preparation ####################### # path to tar data diff --git a/.gitignore b/.gitignore index bef92460568ca779f830417295512f9f81ef0fb6..24af5b9310912a013a8c74ce09628f170a31261f 100644 --- a/.gitignore +++ b/.gitignore @@ -51,6 +51,8 @@ !Snakefile !*.smk !*.svg +!slurm_logs/ + # 3) add a pattern to track the file patterns of section2 even if they are in # subdirectories diff --git a/README.md b/README.md index 7edeb144199b7c6d1533deb4ba23353eaa95cf01..823d88966ccdac8a6bedba262e50446a727824a8 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ This workflow uses [Snakemake](https://snakemake.readthedocs.io/en/stable/) to q A first script (```prejob.sh```) prepares the data until *fasta.gz* files are obtained. A second script (```job.sh```) runs the genome assembly and stats. -doc: [Gitlab pages](https://asm4pg.pages.mia.inra.fr/genomasm4pg) +doc: [Gitlab pages](https://asm4pg.pages.mia.inra.fr/GenomAsm4pg/)  diff --git a/job.sh b/job.sh index 88686b0521f4c2737513bf62636571af8b1039ff..1a5b57e377187b5a889ea5afb6a9758f7d05fe4b 100644 --- a/job.sh +++ b/job.sh @@ -35,12 +35,23 @@ echo 'scontrol show job:' scontrol show job $SLURM_JOB_ID echo '########################################' +## get SNG_BIND abs path using python +function SNG_BIND_ABS_PATH { + SNG_BIND="$(python3 - <<END +import os + +abs_path = os.getcwd() +print(abs_path) + +END +)" +} +SNG_BIND_ABS_PATH ### variables CLUSTER_CONFIG=".config/snakemake_profile/slurm/cluster_config.yml" MAX_CORES=10 PROFILE=".config/snakemake_profile/slurm" -SNG_BIND="/gpfs/scratch/sdenni/wf/GenomAsm4pg" ### Module Loading: module purge @@ -48,8 +59,6 @@ module load snakemake/6.5.1 echo 'Starting Snakemake workflow' -### create a log directory for slurm output files -mkdir -p slurm_logs ### Snakemake commands diff --git a/prejob.sh b/prejob.sh index d65a60ed5d574cd026944c172bac183f7e9a6c25..ec05113d016405655dd09530a01af9a8c4dce9a7 100644 --- a/prejob.sh +++ b/prejob.sh @@ -35,12 +35,25 @@ echo 'scontrol show job:' scontrol show job $SLURM_JOB_ID echo '########################################' + +## get SNG_BIND abs path using python +function SNG_BIND_ABS_PATH { + SNG_BIND="$(python3 - <<END +import os + +abs_path = os.getcwd() +print(abs_path) + +END +)" +} +SNG_BIND_ABS_PATH + ### variables CLUSTER_CONFIG=".config/snakemake_profile/slurm/cluster_config.yml" MAX_CORES=4 PROFILE=".config/snakemake_profile/slurm" SMK_PATH="workflow/pre-job_snakefiles" -SNG_BIND="/gpfs/scratch/sdenni/wf/GenomAsm4pg" ### Module Loading: module purge diff --git a/workflow/Snakefile b/workflow/Snakefile index 538fbbae76c55fa1c96e555ffa69b818d0b12511..6bc7660156e16a5af983375cfef64dac68835058 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,13 +1,21 @@ configfile: ".config/masterconfig.yaml" -res_path=config["root"] + "/" + config["resdir"] - ###### Include all scripts & rules necessary to run the workflow ###### ### Scripts # get parameters from masterconfig include: "scripts/from_config/hifiasm_mode.py" include: "scripts/from_config/parameters.py" include: "scripts/from_config/target_list.py" +include: "scripts/path_helper.py" + +### paths +if config["root"].startswith("."): + abs_root_path = get_abs_root_path() + res_path = get_res_path() +else: + abs_root_path = config["root"] + res_path = abs_root_path + "/" + config["resdir"] + ### Rules ## PRE ASSEMBLY QC @@ -16,11 +24,11 @@ include: "rules/01_qc.smk" include: "rules/02_asm.smk" # Statistics include: "rules/03_asm_qc.smk" -include: "rules/03.5_A_qc_merqury.smk" +include: "rules/03.5_asm_qc_merqury.smk" # Purging include: "rules/04_purge_dups.smk" include: "rules/05_purged_asm_qc.smk" -include: "rules/05.5_PA_qc_merqury.smk" +include: "rules/05.5_purged_asm_qc_merqury.smk" #Â Link final assembly include: "rules/06_sym_link_hap.smk" ## AUTOMATIC REPORT @@ -39,7 +47,6 @@ BID_RUN = run_BFid(bamIDS) FID_RUN = run_BFid(fastqIDS) ###### results path ###### -res_path=config["root"] + "/" + config["resdir"] ###### Target files ###### ## raw data stats diff --git a/workflow/pre-job_snakefiles/Snakefile1.smk b/workflow/pre-job_snakefiles/Snakefile1.smk index 3314ac7c1b39fb9d80f6f04983765000543cbdb7..da275148fc6061e8d80d99ac93d996247a1b872e 100644 --- a/workflow/pre-job_snakefiles/Snakefile1.smk +++ b/workflow/pre-job_snakefiles/Snakefile1.smk @@ -29,6 +29,15 @@ def data_ext(dir, id): return(str(config["data"] + "/{id}.tar.gz")) ######################## Snakemake ######################## + +### paths +if config["root"].startswith("."): + abs_root_path = get_abs_root_path() + res_path = get_res_path() +else: + abs_root_path = config["root"] + res_path = abs_root_path + "/" + config["resdir"] + ### get filenames for workflow if config["get_all_tar_filename"]: IDS=get_tar_name(config["data"]) @@ -36,7 +45,6 @@ else: IDS=config["tarIDS"] ###### results path ###### -res_path=config["root"] + "/" + config["resdir"] ### target files rule all: @@ -60,9 +68,9 @@ rule extract_targz_data: # move bam and fasta + fastq files rule move_files: params: - root=config["root"], - bam_path=config["root"] + "/" + config["resdir"] + "/" + config["bamdir"], - fastx_path=config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"], + root=abs_root_path, + bam_path=abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"], + fastx_path=abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"], shell: "cd {params.root} && " "mkdir -p {params.bam_path} {params.fastx_path} && " diff --git a/workflow/pre-job_snakefiles/Snakefile2.smk b/workflow/pre-job_snakefiles/Snakefile2.smk index b55585c14009c1cf3cc328dac9789a28fcc405b3..1dfcd0d919003c5c2e96af2e2662e34cf50b533f 100644 --- a/workflow/pre-job_snakefiles/Snakefile2.smk +++ b/workflow/pre-job_snakefiles/Snakefile2.smk @@ -14,26 +14,34 @@ def get_bams_name(dirpath): return(IDS) ######################## Snakemake ######################## + +### root path +if config["root"].startswith("."): + abs_root_path = get_abs_root_path() + res_path = get_res_path() +else: + abs_root_path = config["root"] + res_path = abs_root_path + "/" + config["resdir"] + ###### results path ###### -res_path=config["root"] + "/" + config["resdir"] ### get filenames -IDS=get_bams_name(config["root"] + "/" + config["resdir"] + "/" + config["bamdir"]) +IDS=get_bams_name(abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"]) ### target files rule all: input: - expand(config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz", id=IDS), - expand(config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz", id=IDS) + expand(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz", id=IDS), + expand(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz", id=IDS) ### rules ## PacBio .bam conversion with smrtlink # .bam.pbi needed for bam_to_ conversion rules rule smrtlink_index: input: - config["root"] + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam" + abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam" output: - config["root"] + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam.pbi" + abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam.pbi" container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/smrtlink9.0" shell: @@ -42,12 +50,12 @@ rule smrtlink_index: # convert .bam to .fastq.gz rule smrtlink_bam_to_fastq: input: - bam = config["root"] + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam", + bam = abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam", bam_pbi = rules.smrtlink_index.output output: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz" params: - prefix=config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}" + prefix=abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}" priority: 2 container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/smrtlink9.0" @@ -57,12 +65,12 @@ rule smrtlink_bam_to_fastq: # convert .bam to .fasta.gz rule smrtlink_bam_to_fasta: input: - bam = config["root"] + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam", + bam = abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam", bam_pbi = rules.smrtlink_index.output output: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" params: - prefix=config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}" + prefix=abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}" priority: 2 container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/smrtlink9.0" diff --git a/workflow/pre-job_snakefiles/Snakefile3.smk b/workflow/pre-job_snakefiles/Snakefile3.smk index d0ab49218d2657c64f5967fe5f23f38f9d13f876..757ab6b0d3e6088301a9b047298a5ae489518932 100644 --- a/workflow/pre-job_snakefiles/Snakefile3.smk +++ b/workflow/pre-job_snakefiles/Snakefile3.smk @@ -17,26 +17,33 @@ def get_fastq_name(dirpath): return(IDS) ######################## Snakemake ######################## +### root path +if config["root"].startswith("."): + abs_root_path = get_abs_root_path() + res_path = get_res_path() +else: + abs_root_path = config["root"] + res_path = abs_root_path + "/" + config["resdir"] + ###### results path ###### -res_path=config["root"] + "/" + config["resdir"] ### get filenames -IDS = get_fastq_name(config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"]) +IDS = get_fastq_name(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"]) ### target files rule all: input: - expand(config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz", id=IDS) + expand(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz", id=IDS) ### rules # if only fastq : convert to fasta with seqtk + zip rule convert_to_fasta: input: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz" output: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" params: - path=config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + path=abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] threads: 10 container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/seqtk1.3" diff --git a/workflow/rules/00_runtime.smk b/workflow/rules/00_runtime.smk index f53da68ef3c9597cc627ecf606bbfaf069224cdc..2c408ba0576d29b43e39a670e8154d2f7285ffa3 100644 --- a/workflow/rules/00_runtime.smk +++ b/workflow/rules/00_runtime.smk @@ -28,8 +28,6 @@ rule elasped_time: with open(output[0], "w") as out: out.write("Runtime (hh:mm:ss): " + str(td)) - # os.remove(input[0]) - rule elasped_time_trio: input: @@ -49,6 +47,4 @@ rule elasped_time_trio: td = timedelta(seconds=elapsed_time) with open(output[0], "w") as out: - out.write("Runtime (hh:mm:ss): " + str(td)) - - # os.remove(input[0]) \ No newline at end of file + out.write("Runtime (hh:mm:ss): " + str(td)) \ No newline at end of file diff --git a/workflow/rules/01_qc.smk b/workflow/rules/01_qc.smk index 8493080eea31c1041b810dab67fcebf1fb453f08..bf963a2f7f96c023b7f95ca54dc8ac6421bedd57 100644 --- a/workflow/rules/01_qc.smk +++ b/workflow/rules/01_qc.smk @@ -1,7 +1,7 @@ ### QC on .bam files with LongQC rule longqc: input: - config["root"] + "/" + config["resdir"] + "/" + config["bamdir"] + "/{Bid}.bam" + abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{Bid}.bam" output: directory(res_path + "/{Bid}/{run}/01_raw_data_QC/02_longQC") benchmark: diff --git a/workflow/rules/02_asm.smk b/workflow/rules/02_asm.smk index 83c9b2412912b2a02e1cb33d7f21f263214933f1..ee20083a1ea68b2b1a5913d01ba010d1ac8f1280 100644 --- a/workflow/rules/02_asm.smk +++ b/workflow/rules/02_asm.smk @@ -5,12 +5,12 @@ rule hifiasm: input: get_fasta output: - hap1 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap1.p_ctg.gfa", - hap2 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap2.p_ctg.gfa" + hap1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap1.p_ctg.gfa", + hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap2.p_ctg.gfa" params: - prefix = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" + prefix = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" benchmark: - config["root"] + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_benchmark.txt" + abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_benchmark.txt" threads: 20 resources: mem_mb=250000 @@ -28,12 +28,12 @@ rule hifiasm_hic: # hifi reads hifi = get_fasta output: - hap1 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap1.p_ctg.gfa", - hap2 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap2.p_ctg.gfa" + hap1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap1.p_ctg.gfa", + hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap2.p_ctg.gfa" params: - prefix= config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" + prefix= abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" benchmark: - config["root"] + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_hic_benchmark.txt" + abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_hic_benchmark.txt" threads: 20 resources: mem_mb=250000 @@ -48,10 +48,10 @@ rule yak: p1 = get_p1, p2 = get_p2 output: - p1 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/yak/{id}_parent1.yak", - p2 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/yak/{id}_parent2.yak" + p1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/yak/{id}_parent1.yak", + p2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/yak/{id}_parent2.yak" benchmark: - config["root"] + "/" + config["resdir"] + "/{runid}/benchmark/{id}_yak_benchmark.txt" + abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_yak_benchmark.txt" container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm0.16.1" shell: @@ -65,12 +65,12 @@ rule hifiasm_trio: p2 = rules.yak.output.p2, child = get_fasta output: - hap1 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap1.p_ctg.gfa", - hap2 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap2.p_ctg.gfa" + hap1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap1.p_ctg.gfa", + hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap2.p_ctg.gfa" params: - prefix = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" + prefix = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" benchmark: - config["root"] + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_trio_benchmark.txt" + abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_trio_benchmark.txt" threads: 20 resources: mem_mb=250000 @@ -88,8 +88,8 @@ rule hap_gfa_to_fasta: hap1 = get_mode_hap1, hap2 = get_mode_hap2 output: - hap1_fa = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap1.fa.gz", - hap2_fa = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap2.fa.gz" + hap1_fa = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap1.fa.gz", + hap2_fa = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap2.fa.gz" params: pigz_p = config["pigz_threads"] threads: config["pigz_threads"] diff --git a/workflow/rules/03_asm_qc.smk b/workflow/rules/03_asm_qc.smk index c3ea246bfac35181459952f898d8fd8e4795f57f..f32a48053ba91a264d687778e061f1eeacdce878 100644 --- a/workflow/rules/03_asm_qc.smk +++ b/workflow/rules/03_asm_qc.smk @@ -1,5 +1,5 @@ # input haplotypes -HAP_FA_GZ = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap{n}.fa.gz" +HAP_FA_GZ = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap{n}.fa.gz" # unzip fasta rule unzip_hap_fasta: diff --git a/workflow/rules/07_report.smk b/workflow/rules/07_report.smk index a2321652acc1f979f5f9d9ca5c13b564839457d5..e84bc943dee1cc2b7eb78b786238f4e5a35d59bc 100644 --- a/workflow/rules/07_report.smk +++ b/workflow/rules/07_report.smk @@ -1,6 +1,6 @@ ### create report at the end of the workflow + # path variables -res_path=config["root"] + "/" + config["resdir"] RAW_QC = res_path + "/{runid}/01_raw_data_QC" ASM_QC = res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC" P_ASM_QC = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC" diff --git a/workflow/scripts/path_helper.py b/workflow/scripts/path_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..1aa7e2ecf4dc98db435e5fd58883a1c8a730413d --- /dev/null +++ b/workflow/scripts/path_helper.py @@ -0,0 +1,13 @@ +import os + +###### root path ###### +def get_abs_root_path(): + abs_root_path = os.path.abspath(config["root"]) + return(abs_root_path) + + +###### results path ###### +def get_res_path(): + abs_root_path = os.path.abspath(config["root"]) + res_path= abs_root_path + "/" + config["resdir"] + return(res_path) \ No newline at end of file