From f5df6056cd563eadfbb2682da16a196205c0e61d Mon Sep 17 00:00:00 2001
From: "sukanya.denni" <sukanya.denni@inrae.fr>
Date: Tue, 10 Oct 2023 16:26:44 +0200
Subject: [PATCH 1/9] added SNP

---
 bed2vcf.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/bed2vcf.py b/bed2vcf.py
index 13d59ad..1506eff 100644
--- a/bed2vcf.py
+++ b/bed2vcf.py
@@ -67,7 +67,11 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file):
 
 		fasta_seq = fa_dict[chr_name].upper()
 
-		if sv_type == "deletion":
+		if sv_type == "SNP":
+			ref = str(fasta_seq.seq[start:end])
+			alt = sv_info[4]
+
+		elif sv_type == "deletion":
 			end = start + get_random_len("DEL")
 			ref = str(fasta_seq.seq[start:end])
 			alt = str(fasta_seq.seq[start])
-- 
GitLab


From b11058ff6f7fc691bf5b4b5f1f2eb281fa082d69 Mon Sep 17 00:00:00 2001
From: "sukanya.denni" <sukanya.denni@inrae.fr>
Date: Tue, 10 Oct 2023 16:27:02 +0200
Subject: [PATCH 2/9] added script for VISOR bed generation

---
 VISOR_random_bed/random_bed.sh  |   8 ++
 VISOR_random_bed/randomregion.r | 222 ++++++++++++++++++++++++++++++++
 random_bed.sh                   |   6 -
 3 files changed, 230 insertions(+), 6 deletions(-)
 create mode 100644 VISOR_random_bed/random_bed.sh
 create mode 100644 VISOR_random_bed/randomregion.r
 delete mode 100644 random_bed.sh

diff --git a/VISOR_random_bed/random_bed.sh b/VISOR_random_bed/random_bed.sh
new file mode 100644
index 0000000..67a2a5c
--- /dev/null
+++ b/VISOR_random_bed/random_bed.sh
@@ -0,0 +1,8 @@
+### create random BED
+refFAI="/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta.fai"
+GENOME="/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta"
+VISOR_script="randomregion.r"
+
+# samtools faidx $GENOME
+cut -f1,2 $refFAI > chrom.dim.tsv
+Rscript $VISOR_script -d chrom.dim.tsv -n 10 -l 2 -v 'SNP,insertion,deletion,inversion' -r '70:10:10:10' -g $GENOME | sortBed > HACk.random.bed
\ No newline at end of file
diff --git a/VISOR_random_bed/randomregion.r b/VISOR_random_bed/randomregion.r
new file mode 100644
index 0000000..64741c4
--- /dev/null
+++ b/VISOR_random_bed/randomregion.r
@@ -0,0 +1,222 @@
+if (!requireNamespace("BiocManager", quietly = TRUE))
+  install.packages("BiocManager",repos='http://cran.us.r-project.org')
+if (!requireNamespace("optparse", quietly = TRUE))
+  install.packages("optparse",repos='http://cran.us.r-project.org')
+if (!requireNamespace("data.table", quietly = TRUE))
+  install.packages("data.table",repos='http://cran.us.r-project.org') 
+if (!requireNamespace("regioneR", quietly = TRUE))
+  BiocManager::install("regioneR")
+if (!requireNamespace("Rsamtools", quietly = TRUE))
+  BiocManager::install("Rsamtools")
+
+
+suppressPackageStartupMessages(library(optparse))
+suppressPackageStartupMessages(library(data.table))
+suppressPackageStartupMessages(library(regioneR))
+suppressPackageStartupMessages(library(Rsamtools))
+
+option_list = list(
+  make_option(c("-d", "--dimensions"), action="store", type='character', help="TSV file with chromosome dimensions (as result from 'cut -f1,2 genome.fa.fai') [required]"),
+  make_option(c("-n", "--number"), action="store", default=100, type='numeric', help="number of intervals [100]"),
+  make_option(c("-l", "--length"), action="store", default=50000, type='numeric', help="mean intervals length [50000]"),
+  make_option(c("-s", "--standarddev"), action="store", default=0, type='numeric', help="standard deviation for intervals length [0]"),
+  make_option(c("-x", "--exclude"), action="store", default=NULL, type='character', help="exclude regions in BED [optional]"),
+  make_option(c("-v", "--variants"), action="store", type='character', help="variants types (-v 'deletion,tandem duplication,inversion') [required]"),
+  make_option(c("-r", "--ratio"), action="store", type='character', help="variants proportions (-r '30:30:40')) [required]"),
+  make_option(c("-i", "--idhaplo"), action="store", type='numeric', default=1, help="haplotype number [1]"),
+  make_option(c("-m", "--microsatellites"), action="store", type='character', default=NULL, help="BED file with microsatellites (ucsc format) [optional]"),
+  make_option(c("-g", "--genome"), action="store", type="character", default=NULL, help="FASTA genome [optional]")
+)
+
+opt = parse_args(OptionParser(option_list=option_list))
+
+possiblevariants<-c('deletion', 'insertion', 'inversion', 'tandem duplication', 'inverted tandem duplication', 'translocation copy-paste', 'translocation cut-paste' ,  'reciprocal translocation', 'SNP', 'MNP', 'tandem repeat contraction', 'tandem repeat expansion', 'perfect tandem repetition', 'approximate tandem repetition')
+possiblemicro<-c('AAC','AAG','AAT','AC', 'ACA','ACC','ACT','AG','AGA','AGC','AGG','AT','ATA','ATC','ATG','ATT','CA','CAA','CAC','CAG','CAT','CCA','CCG','CCT','CT','CTA','CTC','CTG','CTT','GA','GAA','GAT','GCA','GCC','GGA','GGC','GT','GTG','GTT','TA','TAA','TAC','TAG','TAT','TC','TCA','TCC','TCT','TG','TGA','TGC','TGG','TGT','TTA','TTC','TTG')
+
+if (is.null(opt$dimensions)) {
+  stop('-d/--dimensions TSV file is required')
+} else {
+  genome<-fread(file.path(opt$dimensions), sep='\t', header = F)
+}
+
+if (is.null(opt$exclude)) {
+  exclude<-NULL
+} else {
+  exclude<-fread(file.path(opt$exclude), sep='\t', header=F)
+}
+
+if (is.null(opt$variants)) {
+  stop('variants in -v/--variants are required')
+}
+
+if (is.null(opt$ratio)) {
+  stop('variants ratios in -r/--ratio are required')
+}
+
+variants<-unlist(strsplit(opt$variants, ','))
+
+if (!all(variants %in% possiblevariants)) {
+  stop('accepted variants are: ', paste(possiblevariants, collapse=', '))
+}
+
+testt<-c("tandem repeat contraction", "tandem repeat expansion")
+
+if (any(testt %in% variants)) {
+  if (is.null(opt$microsatellites)) {
+    stop('when adding contractions/expansions of known microsatellites, a BED file specifying known microsatellites must be given')
+  } else {
+    microbed<-fread(file.path(opt$microsatellites), sep='\t', header=FALSE)
+    if (!is.null(exclude)) {
+      exclude<-data.frame(rbind(exclude[,1:3],data.frame(V1=microbed$V1, V2=microbed$V2-1, V3=microbed$V3+1)))
+    } else {
+      exclude<-data.frame(V1=microbed$V1, V2=microbed$V2-2, V3=microbed$V3+2)
+    }
+  }
+}
+
+testv<-c("SNP", "MNP")
+
+if (any(testv %in% variants)) {
+  if (is.null(opt$genome)) {
+    stop('when adding SNPs and MNPs, a genome FASTA must be given')
+  } else {
+    if (!file.exists(file.path(paste0(opt$genome, '.fai')))) {
+      indexFa(file.path(opt$genome))
+    }
+  }
+}
+
+number<-as.numeric(unlist(strsplit(opt$ratio, ':')))
+
+if (cumsum(number)[length(cumsum(number))] != 100) {
+  stop('sum of variant ratios must be 100')
+}
+
+if (length(variants) != length(number)) {
+  stop('for each variant a ratio must be specified')
+}
+
+varwithproportions<-rep(variants, (opt$number/100)*number)
+
+#the number of regions here is all minus the number of repeat contractions/expansions for which we sample from known regions
+keepdist<-varwithproportions[which(varwithproportions %in% testt)]
+dft<-data.frame(matrix(NA, nrow = length(keepdist), ncol = 6), stringsAsFactors=FALSE)
+colnames(dft)<-c("chromosome","start","end","type","info","breakseqlen")
+if (length(keepdist) != 0) {
+  varwithproportions<-varwithproportions[-which(varwithproportions %in% testt)] #remove
+  randomicro<-microbed[sample(nrow(microbed), length(keepdist)),]
+  for (i in 1:nrow(randomicro)) {
+    dft$chromosome[i]<-randomicro$V1[i]
+    dft$start[i]<-as.numeric(randomicro$V2[i])
+    dft$end[i]<-as.numeric(randomicro$V3[i])
+    dft$type[i]<-keepdist[i]
+    motif<-unlist(strsplit(randomicro$V4[i],'x'))[2]
+    number_<-as.numeric(unlist(strsplit(randomicro$V4[i],'x'))[1])
+    minimum<-1
+    if (keepdist[i] == 'tandem repeat contraction') {
+      maximum<-number_-1
+    } else {#is expansion
+      maximum<-500
+    }
+    dft$info[i]<-paste(motif, sample(minimum:maximum,1),sep=':')
+    dft$breakseqlen[i]<-0
+  }
+}
+
+regions<-createRandomRegions(length(varwithproportions), length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=exclude, non.overlapping=TRUE)
+
+# just in case we need another list of regions to exclude for translocation
+newexclude<-data.frame(rbind(exclude[,1:3], cbind(as.character(seqnames(regions)), as.numeric(start(regions)-2), as.numeric(end(regions)+2))))
+
+df <- data.frame(chromosome=seqnames(regions),start=start(regions),end=end(regions), type=varwithproportions, stringsAsFactors = FALSE)
+
+info<-rep('INFO',nrow(df))
+breakseqlen<-rep(0, nrow(df))
+
+for(i in (1:nrow(df))) {
+  if (df$type[i] == 'deletion') {
+    info[i]<-'None'
+    breakseqlen[i]<-sample(0:10,1)
+  } else if (df$type[i] == 'tandem duplication') {
+    info[i] <- '2'
+    breakseqlen[i]<-sample(0:10,1)
+  } else if (df$type[i] == 'inversion'){
+    info[i] <- 'None'
+    breakseqlen[i]<-sample(0:10,1)
+  } else if (df$type[i] == 'inverted tandem duplication') {
+    info[i] <- '2'
+  } else if (df$type[i] == 'insertion') {
+    df$start[i]<-df$end[i]-1
+    num<-round(rnorm(1, mean=opt$length, sd=opt$standarddev)) #adapt mean and stdev for insertions to those specified by the user
+    alphabet<-c('A', 'T', 'C', 'G')
+    motif<-paste(sample(alphabet, num, replace = T), collapse='')
+    info[i]<-motif
+    breakseqlen[i]<-sample(0:10,1)    
+  } else if (df$type[i] == 'perfect tandem repetition') {
+    df$start[i]<-df$end[i]-1
+    motif<-sample(possiblemicro,1)
+    num<-sample(6:500,1)
+    info[i]<-paste(motif,num,sep=':')
+    breakseqlen[i]<-0
+  } else if (df$type[i] == 'approximate tandem repetition') {
+    df$start[i]<-df$end[i]-1
+    motif<-sample(possiblemicro,1)
+    num<-sample(6:500,1)
+    err<-sample(1:round(num/3),1)
+    info[i]<-paste(motif,num,err,sep=':')
+    breakseqlen[i]<-0
+  } else if (df$type[i] == 'SNP') {
+    df$start[i]<-df$end[i]-1
+    tog<-makeGRangesFromDataFrame(data.frame(chromosome=df$chromosome[i], start=df$end[i], end=df$end[i]))
+    nuc<-as.character(scanFa(opt$genome,tog))[[1]]
+    if (nuc == 'N') {
+      alphabet<-c('A', 'T', 'C', 'G', 'N')
+    } else {
+      alphabet<-c('A', 'T', 'C', 'G')
+    }
+    alphabet<-alphabet[-which(alphabet == nuc)]
+    info[i]<-sample(alphabet,1)
+    breakseqlen[i]<-0
+  } else if(df$type[i] == 'MNP') {
+    smallseq<-sample(2:10,1)
+    df$start[i]<-df$end[i]-smallseq
+    tog<-makeGRangesFromDataFrame(data.frame(chromosome=df$chromosome[i], start=df$start[i], end=df$end[i]))
+    nucseq<-as.character(scanFa(opt$genome,tog))[[1]]
+    for (l in 1:nchar(nucseq)) {
+      nuc<-substr(nucseq, l, l)
+      if (nuc == 'N') {
+        alphabet<-c('A', 'T', 'C', 'G', 'N')
+      } else {
+        alphabet<-c('A', 'T', 'C', 'G')
+      }
+      alphabet<-alphabet[-which(alphabet == nuc)]
+      substr(nucseq, l, l)<-sample(alphabet,1)
+    }
+    info[i]<-nucseq
+    breakseqlen[i]<-0
+  } else if (df$type[i] == 'translocation cut-paste' | df$type[i] == 'translocation copy-paste') {
+    newregion<-createRandomRegions(1, length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=newexclude, non.overlapping=TRUE)
+    chromosome<-as.character(seqnames(newregion))
+    start<-as.numeric(start(newregion))
+    end<-as.numeric(end(newregion))
+    orientation<-sample(c('forward', 'reverse'),1)
+    newexclude<-data.frame(rbind(newexclude,c(chromosome, start-2, end+2)))
+    info[i]<-paste(paste0('h',opt$idhaplo),chromosome,start,orientation,sep=':')
+    breakseqlen[i]<-sample(0:10,1) 
+  } else {
+    newregion<-createRandomRegions(1, length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=newexclude, non.overlapping=TRUE)
+    chromosome<-as.character(seqnames(newregion))
+    start<-as.numeric(start(newregion))
+    end<-as.numeric(end(newregion))
+    orientation1<-sample(c('forward', 'reverse'),1)
+    orientation2<-sample(c('forward', 'reverse'),1)
+    newexclude<-data.frame(rbind(newexclude,c(chromosome, start-2, end+2)))
+    info[i]<-paste(paste0('h',opt$idhaplo+1),chromosome,start,orientation1, orientation2, sep=':')
+    breakseqlen[i]<-sample(0:10,1) 
+  }
+} 
+
+final<-data.frame(df,info,breakseqlen, stringsAsFactors = FALSE)
+
+fwrite(rbind(final,dft),file = '',quote = FALSE, col.names = FALSE, row.names = FALSE, sep='\t')
+
diff --git a/random_bed.sh b/random_bed.sh
deleted file mode 100644
index be169a3..0000000
--- a/random_bed.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-### create random BED
-
-refFAI="ztIPO323.chr8.fasta.fai"
-
-cut -f1,2 $refFAI > chrom.dim.tsv
-Rscript randomregion.r -d chrom.dim.tsv -n 2638 -v 'deletion,inversion,inverted tandem duplication,translocation copy-paste,reciprocal translocation' -r '40:20:20:10:10' | sortBed > HACk.random.bed
\ No newline at end of file
-- 
GitLab


From 281f32a275722cecfd9332c4af1ad4432479ee0e Mon Sep 17 00:00:00 2001
From: "sukanya.denni" <sukanya.denni@inrae.fr>
Date: Tue, 10 Oct 2023 16:50:56 +0200
Subject: [PATCH 3/9] move VISOR sv type

---
 visor_sv_type.yaml => VISOR_random_bed/visor_sv_type.yaml | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename visor_sv_type.yaml => VISOR_random_bed/visor_sv_type.yaml (100%)

diff --git a/visor_sv_type.yaml b/VISOR_random_bed/visor_sv_type.yaml
similarity index 100%
rename from visor_sv_type.yaml
rename to VISOR_random_bed/visor_sv_type.yaml
-- 
GitLab


From 4c6a049bff359003f61dab83833d59b9f9289f29 Mon Sep 17 00:00:00 2001
From: "sukanya.denni" <sukanya.denni@inrae.fr>
Date: Tue, 10 Oct 2023 16:51:45 +0200
Subject: [PATCH 4/9] update comment

---
 VISOR_random_bed/visor_sv_type.yaml | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/VISOR_random_bed/visor_sv_type.yaml b/VISOR_random_bed/visor_sv_type.yaml
index 5aafc79..557ca6f 100644
--- a/VISOR_random_bed/visor_sv_type.yaml
+++ b/VISOR_random_bed/visor_sv_type.yaml
@@ -1,10 +1,4 @@
-# possiblevariants<-c('deletion', 'insertion', 'inversion', 'tandem duplication', 'inverted tandem duplication', 'translocation copy-paste', 
-# 'translocation cut-paste' ,  'reciprocal translocation', 'SNP', 'MNP', 'tandem repeat contraction', 
-# 'tandem repeat expansion', 'perfect tandem repetition', 'approximate tandem repetition')
-
-# type de SV
-# pourcentage (sur le nombre de SV)
-# --> rÃ©cupÃ©rer sous forme de dictionnaire ? Ã  convertir en str pour le script randomregion.r
+# type de SV possible dans le script randomregion.r de VISOR
 deletion: 0
 insertion: 0
 inversion: 0
-- 
GitLab


From 206dbaeb93bd75410f1fc8c4954eb22551a0cc62 Mon Sep 17 00:00:00 2001
From: "sukanya.denni" <sukanya.denni@inrae.fr>
Date: Thu, 12 Oct 2023 16:42:56 +0200
Subject: [PATCH 5/9] added defopt for argument parsing

---
 main.py            | 29 ++++++++++++++++++++---------
 tree_generation.py | 12 ++++++++++--
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/main.py b/main.py
index 82e2713..da8b303 100644
--- a/main.py
+++ b/main.py
@@ -1,28 +1,39 @@
 from bed2vcf import read_fa, get_seq
 from fusion import *
 from tree_generation import msprime_vcf
+import defopt
 
 ## n : length of variant
-def main(bed, vcf, fasta, output_file):
-	
+def sv_vcf(bed: str, vcf: str, fasta: str, outName: str):
+	"""
+    Create a VCF with structural variants.
+    
+    :parameter bed: BED file of structural variants to include
+    :parameter vcf: VCF generated with msprime --> tree generation script
+	:parameter fasta: Reference FASTA for the VCF and the variants
+	:parameter outName: Output name
+    
+    """
 	bed_df = replace_bed_col(bed, vcf)
 	vcf_df=read_vcf(vcf)
 	fa = read_fa(fasta)
 
+	get_seq(vcf_df, bed_df, fa, outName)
 
-	get_seq(vcf_df, bed_df, fa, output_file)
+	header_file="results/vcf_header.txt"
+	content_file = "results/" + outName + ".vcf"
+	merged_file="results/" + outName + "_final.vcf"
+	merge_final_vcf(header_file, content_file, merged_file)
+
+if __name__ == '__main__':
+    defopt.run([sv_vcf, msprime_vcf])
 
 bed = "test/mini_random.bed"
 vcf = "test/output.vcf"
 fasta = "/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta"
 
 output_file="tritici_test"
-main(bed, vcf, fasta, output_file)
-
-header_file="results/vcf_header.txt"
-content_file = "results/" + output_file + ".vcf"
-merged_file="results/" + output_file + "_final.vcf"
-merge_final_vcf(header_file, content_file, merged_file)
+# main(bed, vcf, fasta, output_file)
 
 ### generate msprime population VCF
 # fai = "ztIPO323.chr8.fasta.fai"
diff --git a/tree_generation.py b/tree_generation.py
index 4f740ef..51c3860 100644
--- a/tree_generation.py
+++ b/tree_generation.py
@@ -33,8 +33,16 @@ def msprime_map(df):
 # pop_size = population size
 # mut_rate = mutation rate
 # n = sample size / number of indiv
-def msprime_vcf(input_fai, pop_size, mut_rate, n):
-    df = pd.read_table(input_fai, header=None, usecols=[0,1], names =["name", "length"])
+def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int):
+    """
+    Generate VCF for each chromosome in reference FASTA.
+
+    :parameter fai: FAI samtools index of reference FASTA
+    :parameter pop_size: population size (Ne)
+    :parameter mut_rate: mutation rate (Âµ)
+    :parameter n: sample size
+    """
+    df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"])
     rate_map=msprime_map(df)
 
     ts = msprime.sim_ancestry(
-- 
GitLab


From 6090e66842d5559d6a140a55f409c04464d451da Mon Sep 17 00:00:00 2001
From: "sukanya.denni" <sukanya.denni@inrae.fr>
Date: Thu, 12 Oct 2023 16:43:09 +0200
Subject: [PATCH 6/9] updated readme

---
 README.md | 52 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 9cfa840..d490c4a 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,42 @@
-Depuis le script `main.py`:
-- gÃ©nÃ©ration d'une population avec msprime : `msprime_vcf()`
-    - input : FASTA index, taille de pop, mutation rate, nombre d'individus Ã  simuler
-    - output : `msprime_output_chr.vcf`
-- fusion du BED de VISOR et du VCF de msprime : `main()`
-    - input : BED pour VISOR, VCF msprime, FASTA, nom du fichier output
-    - output : VCF
+# Generate structural variants in a population
+With a population generated by msprime (or any VCF), generate structural variants with VISOR and obtain a final VCF with structural variant sequences and population genotypes.
 
-`tree_generation.py` pour crÃ©er les arbres correspondants aux chromosomes. Prend un samtools FAI en entrÃ©e, donne un fichier trees et un fichier VCF.
+Help for commands is available:
+```bash
+python main.py -h
+python main.py sv-vcf -h
+python main.py msprime-vcf -h
+```
 
-`fusion.py` pour fusionner le VCF de msprime, le BED de VISOR et obtenir un VCF final utilisable pour vg.
+## 1. Use a VCF as a base
+A VCF is used as a base for genotypes and the position and number of variants. This VCF can be create with `msprime` to generate a population with:
+```bash
+python main.py msprime-vcf <FAI> <POPULATION SIZE> <MUTATION RATE> <SAMPLE SIZE> 
+```
+This command output as many VCF as there are chromosomes in the reference FAI.
 
-Ã€ faire :
-- ajouter l'entÃªte au VCF final (reprendre celui du VCF de msprime)
-- donner le nombre de variants dans le VCF msprime --> pour crÃ©er un BED avec le mÃªme nombre de variants
-- ajouter les types de variants manquants dans le script `bed2vcf.py`
+## 2. Generate structural variants
+Using the script `VISOR_random_bed/random_bed.sh`, generate a BED with structural variant informations. All structural variant types handled by VISOR are in the `VISOR_random_bed/visor_sv_type.yaml` file. Modify input in the script:
+- generate as many variants as there are in the VCF
+- choose structural variant types
+- choose structural variant proportions (must equal 100)
 
-Le dossier `sv_distributions` contient les distributions de taille de SV (intervalle de 100 bp). Les donnÃ©es viennent de [An integrated map of structural variation in 2,504 human genomes](https://www.nature.com/articles/nature15394).
\ No newline at end of file
+[How to use the randomregion.r script](https://davidebolo1993.github.io/visordoc/usecases/usecases.html#visor-hack)
+
+## 3. Get the final VCF
+The final VCF is obtained by merging the msprime VCF, the VISOR BED and the structural variant sequences.
+```bash
+python main.py sv-vcf <BED> <VCF> <FASTA> <OUTNAME>s
+```
+
+Each variant type has a size distribution file (bins = 100 bp) in folder `sv_distributions`. The data was extracted from [An integrated map of structural variation in 2,504 human genomes (Sudmant, et al. 2015)](https://www.nature.com/articles/nature15394).
+
+The distributions are used to randomly sample each structural variant size.
+
+# TODO
+- [ ] Add non supported VISOR variant types
+    - [ ] MNP
+    - [ ] tandem repeat contraction
+    - [ ] tandem repeat expansion
+    - [ ] approximate tandem repetition
+- [ ] Add VCF merging when there are multiple chromosomes
\ No newline at end of file
-- 
GitLab


From 58e7b5fab27df00255591e03d398467c98d52242 Mon Sep 17 00:00:00 2001
From: "sukanya.denni" <sukanya.denni@inrae.fr>
Date: Wed, 8 Nov 2023 16:55:15 +0100
Subject: [PATCH 7/9] updated readme

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d490c4a..2c1b4be 100644
--- a/README.md
+++ b/README.md
@@ -4,8 +4,8 @@ With a population generated by msprime (or any VCF), generate structural variant
 Help for commands is available:
 ```bash
 python main.py -h
-python main.py sv-vcf -h
 python main.py msprime-vcf -h
+python main.py sv-vcf -h
 ```
 
 ## 1. Use a VCF as a base
@@ -26,7 +26,7 @@ Using the script `VISOR_random_bed/random_bed.sh`, generate a BED with structura
 ## 3. Get the final VCF
 The final VCF is obtained by merging the msprime VCF, the VISOR BED and the structural variant sequences.
 ```bash
-python main.py sv-vcf <BED> <VCF> <FASTA> <OUTNAME>s
+python main.py sv-vcf <BED> <VCF> <FASTA> <OUTNAME>
 ```
 
 Each variant type has a size distribution file (bins = 100 bp) in folder `sv_distributions`. The data was extracted from [An integrated map of structural variation in 2,504 human genomes (Sudmant, et al. 2015)](https://www.nature.com/articles/nature15394).
-- 
GitLab


From ebdffad75ccd2c69bbc56ad791d955e59583fd61 Mon Sep 17 00:00:00 2001
From: "sukanya.denni" <sukanya.denni@inrae.fr>
Date: Wed, 8 Nov 2023 16:55:25 +0100
Subject: [PATCH 8/9] cleanup

---
 tree_generation.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tree_generation.py b/tree_generation.py
index 51c3860..68b67f3 100644
--- a/tree_generation.py
+++ b/tree_generation.py
@@ -4,7 +4,6 @@ import numpy as np
 
 def chrom_pos(df):
     l=df["length"].to_list()
-
     map_positions=[0]
     for i in l:
         map_positions.append(i+map_positions[-1])
@@ -86,10 +85,4 @@ def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int):
                 vcf_file.write(t)
             vcf_file.close()
 
-    print(len(ts_chroms))
-
-
-# file = "/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/g1.chr8.fasta.fai"
-# file2="/home/sukanya/tests/tuto/VISOR/GCF_000001735.4_TAIR10.1_genomic.fna.fai"
-
-# msprime_vcf(file2, 1000, 1e-8)
\ No newline at end of file
+    print(len(ts_chroms))
\ No newline at end of file
-- 
GitLab


From 81a101e0723531154472376856a014c161581fba Mon Sep 17 00:00:00 2001
From: "sukanya.denni" <sukanya.denni@inrae.fr>
Date: Wed, 8 Nov 2023 16:55:36 +0100
Subject: [PATCH 9/9] remove pycache

---
 __pycache__/bed2vcf.cpython-39.pyc   | Bin 3249 -> 0 bytes
 __pycache__/bed2vcf_2.cpython-39.pyc | Bin 2901 -> 0 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 __pycache__/bed2vcf.cpython-39.pyc
 delete mode 100644 __pycache__/bed2vcf_2.cpython-39.pyc

diff --git a/__pycache__/bed2vcf.cpython-39.pyc b/__pycache__/bed2vcf.cpython-39.pyc
deleted file mode 100644
index 97e3c74acd2d61f094048df65f53e6f1910da45c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3249
zcmaJ@PjB1E73T~oij-wp{u5cYl|<bXD1@SRlA@P|8`K-e+W?8ZwcTJrU?FfhluTO|
z<ss!w46;3B7wBby{RD-*xBUh^_1w=e=%GMQz7^;#m-hEYvSK^!O5nVCkMGa#y&29t
ze|oxP;CJ$;e~G`$8OA?}x&Gr|euF0w7=$5M&+wS0+~b;>o(al()^WirFhiKa`r7bp
zQ4lt0Q51y(>IheqKwU8<%Ah4NEh?Z>Vn$R!%VO4>7HrRG&0QjrvE5>-uos>@-w7D;
z;~&AX@I1wnR57H+Im-a+g6AwFbH42rI)0M+EiOr06{5ZfU$?j_Mg74rZFizx=uOEm
zh^1(YC`f-WWC4orezkTGABStn@W}7K@z>HYNt3mQ58FX3!nMvYiQ<0y)zhu^C(kz5
zz6{0V)1b3Dc%zCk^hLYl7YPz!Ggz5P;%*?PEGUddUdKeRbCgY(UmGbhc*{2NTu=tq
zwz_G0m2?8DgiJHzi)zj&bzxkvR|YdqxUL=dtf!yuZ)_u5Tl#$wA1gaJh@&8cgs(nS
ztgUk$4K`dUcfA|-!`B1(qzv}oc#N~!=Jv+wdM^(AUh-shq5vP(VI1{U0lvE-R><y!
z{cd`2+ouWpDL6_9xxpkh*3FEl81zmT!@-1>@QCZ>qk^7>G6}WUADCjvZU3bxX^)jE
zFun>?!~BDqPj(A|ND=5QZC`}@##}OVz{~&Ohd5@EO1S{7TjAt9WV3j#;t(I`)G(nR
z#W3;cbefoq;$UQN*`ks25uckG&&&>`$+jgeWXqy#JMAPqQANL(>WK-7GQr)#p&SwR
z!W7$(Kqj<$2CFiM$$OXu1mNmN^GV`OFc~%*)p!Z}g&7&w93OFl%3y9wT9iu~Xkq=q
zAnc2lsmydZ@XTKHh=#|vA-N;Hb}w3j;duL3nEVLb4Mg`LAjG;v{{X(Ll5~yS!c)Lw
z<3lj%L-@}8qLvpkcF42hl94-^L&$R%6qxf8C>AfD0<}QPD60!PolPH_SxNA^kxhL=
znw7D{tQ+8Fva%=~aru2#IJeS5HuIKYtk4qas#sT2;!YaSwxRv^)Xt`|QXLxy{w9_;
zEw1~~mc=9f9g~gs`qwqmB1MI_NyO`KPa3In?g}R>UyzSKq?9l6!^TBsd3B_lRTqqW
z78px9(&z9w3!k&^BYuDo??;)ABTP{QVAS68xoqwnXY&Qi=Miz9Dhz)Mn4!jET1WZo
z1bJChAmMTyXs*>Uzcb?V8sW(9Tw%y+dgh#$FN}OKTgVm#UpMsXEQrzS)L~hFzasfk
zws=0B)w888*S0qNEz(TacLvc-$!|v=d!!w(fE}jSR_}!6iRQBYB>y))+CNu#R$#HB
z@r*3$s03C)SAxsGT~%VDrDFm0?HvDq7T2TA{K&dZb|^AuhuDLeC92=@yphfvRx>+m
z9L|ba@OQz_WsU4^my0<uPa1eP-!;U-_Y4{g7Dm><s@}^Q=g3ps5w$MM?kyM>CT{M<
ztRd>_hNyuqjp;Jz3it--UC?`2wT8HlI#7)<BmVG?Xo{bRkMMu+tp!hXqWpApqDcDk
zFX^<rq*F<sRh}F5alX@0MpHWNX*wl_gI*N)B)_EN>ug28OheI3ahrw5&1)Lobc$;z
zp<9X!-m#DML4CG(L);98=`RN82g0*D?)+#V=~ipr&_SorAZT|n;Or{~ac_9sPxdh{
zN8Nrb!}h3$u#|~IQCaDmL8z>#--%V}OI-g)1Kf(K;*n5n^)i4P&pH|UJ!PZWNJA-q
zP9py4J_NBm_R}+Sb!}DIiGMukg{pvVMW$!Pc$luDsd&fSEjoCUaG>l1G#79{rEArc
ziThSk+k@yxoODiP6TGD3SKqU`2#U)P?n7S&2g)RDg#@Kdm7T`DC;<&$r<98ACkawi
zd?1v&>8Jjd^p8XNGYVk`aUZo+mXI-!jFyZ%w1ecdbzj-Q+7=yUqZtDAiXC6~Q;PL<
zDRoiNX~kb9Z5a6i)rk~OPL&xPNcl@b+@e4%^=#NuE-NHlCLs?7o}HZP8xbvBCr(CH
zbW+*Y_41tY&H2c4##iSf&+dm`wUg7yyT_Avk3E|#!xOJWcB8rCz+dPSM8*3b+-}8g
zsIO$Y&5}Q3Ft}q{%;FZW@Dg`$_d1;ZbF5-Itj4OCmsy3?%@V8Oech_?DkP)}sa3}N
z5--CV)S|n)#7Y)<sKB<u%38V%8<In2k|w-D6K^4@@VW^(o72*T5eAK;T9A7{c((oE
zK@tyT5dQkr^SzzjCfe0z8aL6omL5F)?e5M?_p_Zn_xYx~`)teIc(L#P?)Mun+^y#?
z_Mh##&$qXB+^wD6mmB-;-p0$%UOa16ls&-5=O=Ol?&ZfcxEMV1lPK0Lwr+Mun~z#B
zy#w`kuG;{O$}7=k8E<B4gKswy&iZjIhQ06!H6RJOagb%%axBNx|AON>e>9f<3wH|X
AcmMzZ

diff --git a/__pycache__/bed2vcf_2.cpython-39.pyc b/__pycache__/bed2vcf_2.cpython-39.pyc
deleted file mode 100644
index e5df67ebe5d1102d81fed96e9dde767ff85e9398..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2901
zcma)8&2QYs6`vV?EXn=QYQ2_LR#I$hpuiTWWav_$P>m{*)e5jziRAh~AXu;*uC(%M
zxf)WkBXH4!lUrN(kUu~==AL8!5}pd=q)U(>Hy_&m-f*>+)F226&YSo6=6&-$2Gy#^
za4o<6UVc5#*f)gSyabRx;z=YH$s`{#pKB<5p`qa$fMRGKTfW7aG^F`C^KEHK8?Yox
z(gAd&D?LD0mSqLdlT}#*EXz4r2dv0>zbg4Y>ol&QiFG?XxAvoxo!yWVKE22s6VE1|
zq>d$J7d(Smmtw**Ixf1t)en+1=m<sJxiuKc=ygZr-e5F7O?&;pF!IYP3S%XEauB9p
zF=c`A-S1Wp<Kt*GIei(7J_}aUC`psmM~`}8ETh%_X)=gMy%(F?y^o%3t^PHVYiD7<
zx3)6=EH9}jkiC9TBBRJU#w%PAcFT71gA{4oTi8f`fxt=QbC$w~H+&1vCHY`>^V>(S
z-A-^Sp`#i5v_9eFykwXB1>@{Q=;#UG-2Bt?jV}DPRWOqAac+l)@gR&q5i5^!-qYR|
ziw#k^yM7RjqSs^fxB~PCJnVd7tGltXK8(X)m^@yYF+hfS6c0wZ1=$A?a%2yq(Ls86
zSEhmUWe7^ha>i8y>;DCV-sxaC*iaH*3Y|a-=vkzaNK5^S=RCP9-y0;oDN_xHuhlI4
zn3`F=kP#^qy`chzx-X0+rWIcO7Z>yxNvc#6TzAx|d!XU#nCT%rG_YJ>lvl#FJ8f(V
zJy`i0zQ`s<fltiL$jm;u$-9aY*|EsmelLkm@=`ENY2%wJe)T#xK+g@(?pTvMG8#rH
z%8^V>D2I&Kxg*psunTno5)BvaXYRLwWGpj|ScY2Zi**<`#ROrJf?;%hWADkfZ)`k$
zj`WHk?U?EVoKzoTflcwN+@{Kx{kvwBlPE<}`aPKZ0L;@*p~ls(fZYlg2_(C9xV8|u
zX?TnSHZk!r81yl`6`%fRVrBeDWY!X!*qJRkU<ptFI)F&v#04|~y-Z{!z;aeTGBQVs
z7R%f(h_f=K7%c{@nt9TCDbznxGpoMgSSvb3wE~~aoixE=gX3SRmARSILcvTb)3;Nu
z*{OoN1nzGOj@g@9YLn#alQTL_q@@c-I+=GV-g32-k_YY*l_hMR)C<n6-ei+`SmM1T
zuHQh~JfzJRS|;!902KvxEwxNiRkM&dX=IHHPgXAZ<Q}x#qj*6^6FZ8_eMIa&-~w<&
zP*%03FR$%}?iwgUEfhLu9?T+@u_8AY<x4hM%<gB4lB24c5|PH9Dr^C{&^Yw8Afa_5
z;!9ZzTqd}d0A0XENURjKu+B!`!Gbw$!M}9|n>wdWyw@4ixqa~Cwt@Pfh|4VtmO)?E
z78Gw=@EnoXo>N7<&sf1#RL0WJ@bjlV`SmR;s)fxra#-k@ORHHeo69V;C=(sMDeLHp
z^N0(R4OmGPjUC}fHF-}q<$c@>{{XN4b{D&fX3UjdsJUKIOVVAzcLyW%Q)(KtrS3}G
zvOFCR2Vp?;D{2(y%lc&+$##nA5goU0aQM{}Z9k!@KnLFO-|NPBzIcn=4o}nHjxm9w
z^A@J;V6140v~Tg`e8m?#Te<P^?*4ft##E+hyMn&*j=S3(1D!Y;=k_7yYNQl-(V5E)
zj2cBVKQ{-kfp|{T5_w3CU7L~%73Fpk1u8tujln3*tppCHxt+$tL6Sm79O`yNlwO#e
zbv{tPB#|~II(nL4>IeF#Am@k<nqATZUm#`92f0Yjaw9xc5Sj)G!lDuAsc+*9*|JP#
zQ0KeRFhw}_?c_}Ncl?R;;{?_N_)e;NT8wXF2dP>$#H{6|QS_<@-M%xO3~RnK9Sv)m
zn`VSZ???s|)AT78J<v-BXvCmD1UdOT7A73q6gA-qhvOD>H+ao(Fg85H5fxAygyDU|
ztcf}(1#iQwfY-zS44tXb-oU6aYqn!lL`|P?jJn|&4@8~UEus(?R>&l|^cH(i>IU|f
z+Jrsl-G>j8_*8|_A71S2@9wqnPt{K2Hh!<Xhns)i+kNJuGTfal7qiI4C~`mf%f?f8
zd*|u%CwuNrcYD{}-rak)@!Z|tc=qwrC!Jbuk5R8dl8doakDx-iSbXE7L9BlR_3v0w
zDD;z{t3`hqx*D`mzDILyT66D1s&_iB&iZjIPlwTC`oIzrLjkOqj_JTP+_vL7e`8Dk
E13r$1>Hq)$

-- 
GitLab