From f5df6056cd563eadfbb2682da16a196205c0e61d Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 10 Oct 2023 16:26:44 +0200 Subject: [PATCH 1/9] added SNP --- bed2vcf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bed2vcf.py b/bed2vcf.py index 13d59ad..1506eff 100644 --- a/bed2vcf.py +++ b/bed2vcf.py @@ -67,7 +67,11 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): fasta_seq = fa_dict[chr_name].upper() - if sv_type == "deletion": + if sv_type == "SNP": + ref = str(fasta_seq.seq[start:end]) + alt = sv_info[4] + + elif sv_type == "deletion": end = start + get_random_len("DEL") ref = str(fasta_seq.seq[start:end]) alt = str(fasta_seq.seq[start]) -- GitLab From b11058ff6f7fc691bf5b4b5f1f2eb281fa082d69 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 10 Oct 2023 16:27:02 +0200 Subject: [PATCH 2/9] added script for VISOR bed generation --- VISOR_random_bed/random_bed.sh | 8 ++ VISOR_random_bed/randomregion.r | 222 ++++++++++++++++++++++++++++++++ random_bed.sh | 6 - 3 files changed, 230 insertions(+), 6 deletions(-) create mode 100644 VISOR_random_bed/random_bed.sh create mode 100644 VISOR_random_bed/randomregion.r delete mode 100644 random_bed.sh diff --git a/VISOR_random_bed/random_bed.sh b/VISOR_random_bed/random_bed.sh new file mode 100644 index 0000000..67a2a5c --- /dev/null +++ b/VISOR_random_bed/random_bed.sh @@ -0,0 +1,8 @@ +### create random BED +refFAI="/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta.fai" +GENOME="/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta" +VISOR_script="randomregion.r" + +# samtools faidx $GENOME +cut -f1,2 $refFAI > chrom.dim.tsv +Rscript $VISOR_script -d chrom.dim.tsv -n 10 -l 2 -v 'SNP,insertion,deletion,inversion' -r '70:10:10:10' -g $GENOME | sortBed > HACk.random.bed \ No newline at end of file diff --git a/VISOR_random_bed/randomregion.r b/VISOR_random_bed/randomregion.r new file mode 100644 index 0000000..64741c4 --- /dev/null +++ b/VISOR_random_bed/randomregion.r @@ -0,0 +1,222 @@ +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager",repos='http://cran.us.r-project.org') +if (!requireNamespace("optparse", quietly = TRUE)) + install.packages("optparse",repos='http://cran.us.r-project.org') +if (!requireNamespace("data.table", quietly = TRUE)) + install.packages("data.table",repos='http://cran.us.r-project.org') +if (!requireNamespace("regioneR", quietly = TRUE)) + BiocManager::install("regioneR") +if (!requireNamespace("Rsamtools", quietly = TRUE)) + BiocManager::install("Rsamtools") + + +suppressPackageStartupMessages(library(optparse)) +suppressPackageStartupMessages(library(data.table)) +suppressPackageStartupMessages(library(regioneR)) +suppressPackageStartupMessages(library(Rsamtools)) + +option_list = list( + make_option(c("-d", "--dimensions"), action="store", type='character', help="TSV file with chromosome dimensions (as result from 'cut -f1,2 genome.fa.fai') [required]"), + make_option(c("-n", "--number"), action="store", default=100, type='numeric', help="number of intervals [100]"), + make_option(c("-l", "--length"), action="store", default=50000, type='numeric', help="mean intervals length [50000]"), + make_option(c("-s", "--standarddev"), action="store", default=0, type='numeric', help="standard deviation for intervals length [0]"), + make_option(c("-x", "--exclude"), action="store", default=NULL, type='character', help="exclude regions in BED [optional]"), + make_option(c("-v", "--variants"), action="store", type='character', help="variants types (-v 'deletion,tandem duplication,inversion') [required]"), + make_option(c("-r", "--ratio"), action="store", type='character', help="variants proportions (-r '30:30:40')) [required]"), + make_option(c("-i", "--idhaplo"), action="store", type='numeric', default=1, help="haplotype number [1]"), + make_option(c("-m", "--microsatellites"), action="store", type='character', default=NULL, help="BED file with microsatellites (ucsc format) [optional]"), + make_option(c("-g", "--genome"), action="store", type="character", default=NULL, help="FASTA genome [optional]") +) + +opt = parse_args(OptionParser(option_list=option_list)) + +possiblevariants<-c('deletion', 'insertion', 'inversion', 'tandem duplication', 'inverted tandem duplication', 'translocation copy-paste', 'translocation cut-paste' , 'reciprocal translocation', 'SNP', 'MNP', 'tandem repeat contraction', 'tandem repeat expansion', 'perfect tandem repetition', 'approximate tandem repetition') +possiblemicro<-c('AAC','AAG','AAT','AC', 'ACA','ACC','ACT','AG','AGA','AGC','AGG','AT','ATA','ATC','ATG','ATT','CA','CAA','CAC','CAG','CAT','CCA','CCG','CCT','CT','CTA','CTC','CTG','CTT','GA','GAA','GAT','GCA','GCC','GGA','GGC','GT','GTG','GTT','TA','TAA','TAC','TAG','TAT','TC','TCA','TCC','TCT','TG','TGA','TGC','TGG','TGT','TTA','TTC','TTG') + +if (is.null(opt$dimensions)) { + stop('-d/--dimensions TSV file is required') +} else { + genome<-fread(file.path(opt$dimensions), sep='\t', header = F) +} + +if (is.null(opt$exclude)) { + exclude<-NULL +} else { + exclude<-fread(file.path(opt$exclude), sep='\t', header=F) +} + +if (is.null(opt$variants)) { + stop('variants in -v/--variants are required') +} + +if (is.null(opt$ratio)) { + stop('variants ratios in -r/--ratio are required') +} + +variants<-unlist(strsplit(opt$variants, ',')) + +if (!all(variants %in% possiblevariants)) { + stop('accepted variants are: ', paste(possiblevariants, collapse=', ')) +} + +testt<-c("tandem repeat contraction", "tandem repeat expansion") + +if (any(testt %in% variants)) { + if (is.null(opt$microsatellites)) { + stop('when adding contractions/expansions of known microsatellites, a BED file specifying known microsatellites must be given') + } else { + microbed<-fread(file.path(opt$microsatellites), sep='\t', header=FALSE) + if (!is.null(exclude)) { + exclude<-data.frame(rbind(exclude[,1:3],data.frame(V1=microbed$V1, V2=microbed$V2-1, V3=microbed$V3+1))) + } else { + exclude<-data.frame(V1=microbed$V1, V2=microbed$V2-2, V3=microbed$V3+2) + } + } +} + +testv<-c("SNP", "MNP") + +if (any(testv %in% variants)) { + if (is.null(opt$genome)) { + stop('when adding SNPs and MNPs, a genome FASTA must be given') + } else { + if (!file.exists(file.path(paste0(opt$genome, '.fai')))) { + indexFa(file.path(opt$genome)) + } + } +} + +number<-as.numeric(unlist(strsplit(opt$ratio, ':'))) + +if (cumsum(number)[length(cumsum(number))] != 100) { + stop('sum of variant ratios must be 100') +} + +if (length(variants) != length(number)) { + stop('for each variant a ratio must be specified') +} + +varwithproportions<-rep(variants, (opt$number/100)*number) + +#the number of regions here is all minus the number of repeat contractions/expansions for which we sample from known regions +keepdist<-varwithproportions[which(varwithproportions %in% testt)] +dft<-data.frame(matrix(NA, nrow = length(keepdist), ncol = 6), stringsAsFactors=FALSE) +colnames(dft)<-c("chromosome","start","end","type","info","breakseqlen") +if (length(keepdist) != 0) { + varwithproportions<-varwithproportions[-which(varwithproportions %in% testt)] #remove + randomicro<-microbed[sample(nrow(microbed), length(keepdist)),] + for (i in 1:nrow(randomicro)) { + dft$chromosome[i]<-randomicro$V1[i] + dft$start[i]<-as.numeric(randomicro$V2[i]) + dft$end[i]<-as.numeric(randomicro$V3[i]) + dft$type[i]<-keepdist[i] + motif<-unlist(strsplit(randomicro$V4[i],'x'))[2] + number_<-as.numeric(unlist(strsplit(randomicro$V4[i],'x'))[1]) + minimum<-1 + if (keepdist[i] == 'tandem repeat contraction') { + maximum<-number_-1 + } else {#is expansion + maximum<-500 + } + dft$info[i]<-paste(motif, sample(minimum:maximum,1),sep=':') + dft$breakseqlen[i]<-0 + } +} + +regions<-createRandomRegions(length(varwithproportions), length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=exclude, non.overlapping=TRUE) + +# just in case we need another list of regions to exclude for translocation +newexclude<-data.frame(rbind(exclude[,1:3], cbind(as.character(seqnames(regions)), as.numeric(start(regions)-2), as.numeric(end(regions)+2)))) + +df <- data.frame(chromosome=seqnames(regions),start=start(regions),end=end(regions), type=varwithproportions, stringsAsFactors = FALSE) + +info<-rep('INFO',nrow(df)) +breakseqlen<-rep(0, nrow(df)) + +for(i in (1:nrow(df))) { + if (df$type[i] == 'deletion') { + info[i]<-'None' + breakseqlen[i]<-sample(0:10,1) + } else if (df$type[i] == 'tandem duplication') { + info[i] <- '2' + breakseqlen[i]<-sample(0:10,1) + } else if (df$type[i] == 'inversion'){ + info[i] <- 'None' + breakseqlen[i]<-sample(0:10,1) + } else if (df$type[i] == 'inverted tandem duplication') { + info[i] <- '2' + } else if (df$type[i] == 'insertion') { + df$start[i]<-df$end[i]-1 + num<-round(rnorm(1, mean=opt$length, sd=opt$standarddev)) #adapt mean and stdev for insertions to those specified by the user + alphabet<-c('A', 'T', 'C', 'G') + motif<-paste(sample(alphabet, num, replace = T), collapse='') + info[i]<-motif + breakseqlen[i]<-sample(0:10,1) + } else if (df$type[i] == 'perfect tandem repetition') { + df$start[i]<-df$end[i]-1 + motif<-sample(possiblemicro,1) + num<-sample(6:500,1) + info[i]<-paste(motif,num,sep=':') + breakseqlen[i]<-0 + } else if (df$type[i] == 'approximate tandem repetition') { + df$start[i]<-df$end[i]-1 + motif<-sample(possiblemicro,1) + num<-sample(6:500,1) + err<-sample(1:round(num/3),1) + info[i]<-paste(motif,num,err,sep=':') + breakseqlen[i]<-0 + } else if (df$type[i] == 'SNP') { + df$start[i]<-df$end[i]-1 + tog<-makeGRangesFromDataFrame(data.frame(chromosome=df$chromosome[i], start=df$end[i], end=df$end[i])) + nuc<-as.character(scanFa(opt$genome,tog))[[1]] + if (nuc == 'N') { + alphabet<-c('A', 'T', 'C', 'G', 'N') + } else { + alphabet<-c('A', 'T', 'C', 'G') + } + alphabet<-alphabet[-which(alphabet == nuc)] + info[i]<-sample(alphabet,1) + breakseqlen[i]<-0 + } else if(df$type[i] == 'MNP') { + smallseq<-sample(2:10,1) + df$start[i]<-df$end[i]-smallseq + tog<-makeGRangesFromDataFrame(data.frame(chromosome=df$chromosome[i], start=df$start[i], end=df$end[i])) + nucseq<-as.character(scanFa(opt$genome,tog))[[1]] + for (l in 1:nchar(nucseq)) { + nuc<-substr(nucseq, l, l) + if (nuc == 'N') { + alphabet<-c('A', 'T', 'C', 'G', 'N') + } else { + alphabet<-c('A', 'T', 'C', 'G') + } + alphabet<-alphabet[-which(alphabet == nuc)] + substr(nucseq, l, l)<-sample(alphabet,1) + } + info[i]<-nucseq + breakseqlen[i]<-0 + } else if (df$type[i] == 'translocation cut-paste' | df$type[i] == 'translocation copy-paste') { + newregion<-createRandomRegions(1, length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=newexclude, non.overlapping=TRUE) + chromosome<-as.character(seqnames(newregion)) + start<-as.numeric(start(newregion)) + end<-as.numeric(end(newregion)) + orientation<-sample(c('forward', 'reverse'),1) + newexclude<-data.frame(rbind(newexclude,c(chromosome, start-2, end+2))) + info[i]<-paste(paste0('h',opt$idhaplo),chromosome,start,orientation,sep=':') + breakseqlen[i]<-sample(0:10,1) + } else { + newregion<-createRandomRegions(1, length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=newexclude, non.overlapping=TRUE) + chromosome<-as.character(seqnames(newregion)) + start<-as.numeric(start(newregion)) + end<-as.numeric(end(newregion)) + orientation1<-sample(c('forward', 'reverse'),1) + orientation2<-sample(c('forward', 'reverse'),1) + newexclude<-data.frame(rbind(newexclude,c(chromosome, start-2, end+2))) + info[i]<-paste(paste0('h',opt$idhaplo+1),chromosome,start,orientation1, orientation2, sep=':') + breakseqlen[i]<-sample(0:10,1) + } +} + +final<-data.frame(df,info,breakseqlen, stringsAsFactors = FALSE) + +fwrite(rbind(final,dft),file = '',quote = FALSE, col.names = FALSE, row.names = FALSE, sep='\t') + diff --git a/random_bed.sh b/random_bed.sh deleted file mode 100644 index be169a3..0000000 --- a/random_bed.sh +++ /dev/null @@ -1,6 +0,0 @@ -### create random BED - -refFAI="ztIPO323.chr8.fasta.fai" - -cut -f1,2 $refFAI > chrom.dim.tsv -Rscript randomregion.r -d chrom.dim.tsv -n 2638 -v 'deletion,inversion,inverted tandem duplication,translocation copy-paste,reciprocal translocation' -r '40:20:20:10:10' | sortBed > HACk.random.bed \ No newline at end of file -- GitLab From 281f32a275722cecfd9332c4af1ad4432479ee0e Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 10 Oct 2023 16:50:56 +0200 Subject: [PATCH 3/9] move VISOR sv type --- visor_sv_type.yaml => VISOR_random_bed/visor_sv_type.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename visor_sv_type.yaml => VISOR_random_bed/visor_sv_type.yaml (100%) diff --git a/visor_sv_type.yaml b/VISOR_random_bed/visor_sv_type.yaml similarity index 100% rename from visor_sv_type.yaml rename to VISOR_random_bed/visor_sv_type.yaml -- GitLab From 4c6a049bff359003f61dab83833d59b9f9289f29 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 10 Oct 2023 16:51:45 +0200 Subject: [PATCH 4/9] update comment --- VISOR_random_bed/visor_sv_type.yaml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/VISOR_random_bed/visor_sv_type.yaml b/VISOR_random_bed/visor_sv_type.yaml index 5aafc79..557ca6f 100644 --- a/VISOR_random_bed/visor_sv_type.yaml +++ b/VISOR_random_bed/visor_sv_type.yaml @@ -1,10 +1,4 @@ -# possiblevariants<-c('deletion', 'insertion', 'inversion', 'tandem duplication', 'inverted tandem duplication', 'translocation copy-paste', -# 'translocation cut-paste' , 'reciprocal translocation', 'SNP', 'MNP', 'tandem repeat contraction', -# 'tandem repeat expansion', 'perfect tandem repetition', 'approximate tandem repetition') - -# type de SV -# pourcentage (sur le nombre de SV) -# --> récupérer sous forme de dictionnaire ? à convertir en str pour le script randomregion.r +# type de SV possible dans le script randomregion.r de VISOR deletion: 0 insertion: 0 inversion: 0 -- GitLab From 206dbaeb93bd75410f1fc8c4954eb22551a0cc62 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Thu, 12 Oct 2023 16:42:56 +0200 Subject: [PATCH 5/9] added defopt for argument parsing --- main.py | 29 ++++++++++++++++++++--------- tree_generation.py | 12 ++++++++++-- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/main.py b/main.py index 82e2713..da8b303 100644 --- a/main.py +++ b/main.py @@ -1,28 +1,39 @@ from bed2vcf import read_fa, get_seq from fusion import * from tree_generation import msprime_vcf +import defopt ## n : length of variant -def main(bed, vcf, fasta, output_file): - +def sv_vcf(bed: str, vcf: str, fasta: str, outName: str): + """ + Create a VCF with structural variants. + + :parameter bed: BED file of structural variants to include + :parameter vcf: VCF generated with msprime --> tree generation script + :parameter fasta: Reference FASTA for the VCF and the variants + :parameter outName: Output name + + """ bed_df = replace_bed_col(bed, vcf) vcf_df=read_vcf(vcf) fa = read_fa(fasta) + get_seq(vcf_df, bed_df, fa, outName) - get_seq(vcf_df, bed_df, fa, output_file) + header_file="results/vcf_header.txt" + content_file = "results/" + outName + ".vcf" + merged_file="results/" + outName + "_final.vcf" + merge_final_vcf(header_file, content_file, merged_file) + +if __name__ == '__main__': + defopt.run([sv_vcf, msprime_vcf]) bed = "test/mini_random.bed" vcf = "test/output.vcf" fasta = "/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta" output_file="tritici_test" -main(bed, vcf, fasta, output_file) - -header_file="results/vcf_header.txt" -content_file = "results/" + output_file + ".vcf" -merged_file="results/" + output_file + "_final.vcf" -merge_final_vcf(header_file, content_file, merged_file) +# main(bed, vcf, fasta, output_file) ### generate msprime population VCF # fai = "ztIPO323.chr8.fasta.fai" diff --git a/tree_generation.py b/tree_generation.py index 4f740ef..51c3860 100644 --- a/tree_generation.py +++ b/tree_generation.py @@ -33,8 +33,16 @@ def msprime_map(df): # pop_size = population size # mut_rate = mutation rate # n = sample size / number of indiv -def msprime_vcf(input_fai, pop_size, mut_rate, n): - df = pd.read_table(input_fai, header=None, usecols=[0,1], names =["name", "length"]) +def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int): + """ + Generate VCF for each chromosome in reference FASTA. + + :parameter fai: FAI samtools index of reference FASTA + :parameter pop_size: population size (Ne) + :parameter mut_rate: mutation rate (µ) + :parameter n: sample size + """ + df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"]) rate_map=msprime_map(df) ts = msprime.sim_ancestry( -- GitLab From 6090e66842d5559d6a140a55f409c04464d451da Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Thu, 12 Oct 2023 16:43:09 +0200 Subject: [PATCH 6/9] updated readme --- README.md | 52 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 9cfa840..d490c4a 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,42 @@ -Depuis le script `main.py`: -- génération d'une population avec msprime : `msprime_vcf()` - - input : FASTA index, taille de pop, mutation rate, nombre d'individus à simuler - - output : `msprime_output_chr.vcf` -- fusion du BED de VISOR et du VCF de msprime : `main()` - - input : BED pour VISOR, VCF msprime, FASTA, nom du fichier output - - output : VCF +# Generate structural variants in a population +With a population generated by msprime (or any VCF), generate structural variants with VISOR and obtain a final VCF with structural variant sequences and population genotypes. -`tree_generation.py` pour créer les arbres correspondants aux chromosomes. Prend un samtools FAI en entrée, donne un fichier trees et un fichier VCF. +Help for commands is available: +```bash +python main.py -h +python main.py sv-vcf -h +python main.py msprime-vcf -h +``` -`fusion.py` pour fusionner le VCF de msprime, le BED de VISOR et obtenir un VCF final utilisable pour vg. +## 1. Use a VCF as a base +A VCF is used as a base for genotypes and the position and number of variants. This VCF can be create with `msprime` to generate a population with: +```bash +python main.py msprime-vcf <FAI> <POPULATION SIZE> <MUTATION RATE> <SAMPLE SIZE> +``` +This command output as many VCF as there are chromosomes in the reference FAI. -À faire : -- ajouter l'entête au VCF final (reprendre celui du VCF de msprime) -- donner le nombre de variants dans le VCF msprime --> pour créer un BED avec le même nombre de variants -- ajouter les types de variants manquants dans le script `bed2vcf.py` +## 2. Generate structural variants +Using the script `VISOR_random_bed/random_bed.sh`, generate a BED with structural variant informations. All structural variant types handled by VISOR are in the `VISOR_random_bed/visor_sv_type.yaml` file. Modify input in the script: +- generate as many variants as there are in the VCF +- choose structural variant types +- choose structural variant proportions (must equal 100) -Le dossier `sv_distributions` contient les distributions de taille de SV (intervalle de 100 bp). Les données viennent de [An integrated map of structural variation in 2,504 human genomes](https://www.nature.com/articles/nature15394). \ No newline at end of file +[How to use the randomregion.r script](https://davidebolo1993.github.io/visordoc/usecases/usecases.html#visor-hack) + +## 3. Get the final VCF +The final VCF is obtained by merging the msprime VCF, the VISOR BED and the structural variant sequences. +```bash +python main.py sv-vcf <BED> <VCF> <FASTA> <OUTNAME>s +``` + +Each variant type has a size distribution file (bins = 100 bp) in folder `sv_distributions`. The data was extracted from [An integrated map of structural variation in 2,504 human genomes (Sudmant, et al. 2015)](https://www.nature.com/articles/nature15394). + +The distributions are used to randomly sample each structural variant size. + +# TODO +- [ ] Add non supported VISOR variant types + - [ ] MNP + - [ ] tandem repeat contraction + - [ ] tandem repeat expansion + - [ ] approximate tandem repetition +- [ ] Add VCF merging when there are multiple chromosomes \ No newline at end of file -- GitLab From 58e7b5fab27df00255591e03d398467c98d52242 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Wed, 8 Nov 2023 16:55:15 +0100 Subject: [PATCH 7/9] updated readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d490c4a..2c1b4be 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ With a population generated by msprime (or any VCF), generate structural variant Help for commands is available: ```bash python main.py -h -python main.py sv-vcf -h python main.py msprime-vcf -h +python main.py sv-vcf -h ``` ## 1. Use a VCF as a base @@ -26,7 +26,7 @@ Using the script `VISOR_random_bed/random_bed.sh`, generate a BED with structura ## 3. Get the final VCF The final VCF is obtained by merging the msprime VCF, the VISOR BED and the structural variant sequences. ```bash -python main.py sv-vcf <BED> <VCF> <FASTA> <OUTNAME>s +python main.py sv-vcf <BED> <VCF> <FASTA> <OUTNAME> ``` Each variant type has a size distribution file (bins = 100 bp) in folder `sv_distributions`. The data was extracted from [An integrated map of structural variation in 2,504 human genomes (Sudmant, et al. 2015)](https://www.nature.com/articles/nature15394). -- GitLab From ebdffad75ccd2c69bbc56ad791d955e59583fd61 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Wed, 8 Nov 2023 16:55:25 +0100 Subject: [PATCH 8/9] cleanup --- tree_generation.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tree_generation.py b/tree_generation.py index 51c3860..68b67f3 100644 --- a/tree_generation.py +++ b/tree_generation.py @@ -4,7 +4,6 @@ import numpy as np def chrom_pos(df): l=df["length"].to_list() - map_positions=[0] for i in l: map_positions.append(i+map_positions[-1]) @@ -86,10 +85,4 @@ def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int): vcf_file.write(t) vcf_file.close() - print(len(ts_chroms)) - - -# file = "/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/g1.chr8.fasta.fai" -# file2="/home/sukanya/tests/tuto/VISOR/GCF_000001735.4_TAIR10.1_genomic.fna.fai" - -# msprime_vcf(file2, 1000, 1e-8) \ No newline at end of file + print(len(ts_chroms)) \ No newline at end of file -- GitLab From 81a101e0723531154472376856a014c161581fba Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Wed, 8 Nov 2023 16:55:36 +0100 Subject: [PATCH 9/9] remove pycache --- __pycache__/bed2vcf.cpython-39.pyc | Bin 3249 -> 0 bytes __pycache__/bed2vcf_2.cpython-39.pyc | Bin 2901 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 __pycache__/bed2vcf.cpython-39.pyc delete mode 100644 __pycache__/bed2vcf_2.cpython-39.pyc diff --git a/__pycache__/bed2vcf.cpython-39.pyc b/__pycache__/bed2vcf.cpython-39.pyc deleted file mode 100644 index 97e3c74acd2d61f094048df65f53e6f1910da45c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3249 zcmaJ@PjB1E73T~oij-wp{u5cYl|<bXD1@SRlA@P|8`K-e+W?8ZwcTJrU?FfhluTO| z<ss!w46;3B7wBby{RD-*xBUh^_1w=e=%GMQz7^;#m-hEYvSK^!O5nVCkMGa#y&29t ze|oxP;CJ$;e~G`$8OA?}x&Gr|euF0w7=$5M&+wS0+~b;>o(al()^WirFhiKa`r7bp zQ4lt0Q51y(>IheqKwU8<%Ah4NEh?Z>Vn$R!%VO4>7HrRG&0QjrvE5>-uos>@-w7D; z;~&AX@I1wnR57H+Im-a+g6AwFbH42rI)0M+EiOr06{5ZfU$?j_Mg74rZFizx=uOEm zh^1(YC`f-WWC4orezkTGABStn@W}7K@z>HYNt3mQ58FX3!nMvYiQ<0y)zhu^C(kz5 zz6{0V)1b3Dc%zCk^hLYl7YPz!Ggz5P;%*?PEGUddUdKeRbCgY(UmGbhc*{2NTu=tq zwz_G0m2?8DgiJHzi)zj&bzxkvR|YdqxUL=dtf!yuZ)_u5Tl#$wA1gaJh@&8cgs(nS ztgUk$4K`dUcfA|-!`B1(qzv}oc#N~!=Jv+wdM^(AUh-shq5vP(VI1{U0lvE-R><y! z{cd`2+ouWpDL6_9xxpkh*3FEl81zmT!@-1>@QCZ>qk^7>G6}WUADCjvZU3bxX^)jE zFun>?!~BDqPj(A|ND=5QZC`}@##}OVz{~&Ohd5@EO1S{7TjAt9WV3j#;t(I`)G(nR z#W3;cbefoq;$UQN*`ks25uckG&&&>`$+jgeWXqy#JMAPqQANL(>WK-7GQr)#p&SwR z!W7$(Kqj<$2CFiM$$OXu1mNmN^GV`OFc~%*)p!Z}g&7&w93OFl%3y9wT9iu~Xkq=q zAnc2lsmydZ@XTKHh=#|vA-N;Hb}w3j;duL3nEVLb4Mg`LAjG;v{{X(Ll5~yS!c)Lw z<3lj%L-@}8qLvpkcF42hl94-^L&$R%6qxf8C>AfD0<}QPD60!PolPH_SxNA^kxhL= znw7D{tQ+8Fva%=~aru2#IJeS5HuIKYtk4qas#sT2;!YaSwxRv^)Xt`|QXLxy{w9_; zEw1~~mc=9f9g~gs`qwqmB1MI_NyO`KPa3In?g}R>UyzSKq?9l6!^TBsd3B_lRTqqW z78px9(&z9w3!k&^BYuDo??;)ABTP{QVAS68xoqwnXY&Qi=Miz9Dhz)Mn4!jET1WZo z1bJChAmMTyXs*>Uzcb?V8sW(9Tw%y+dgh#$FN}OKTgVm#UpMsXEQrzS)L~hFzasfk zws=0B)w888*S0qNEz(TacLvc-$!|v=d!!w(fE}jSR_}!6iRQBYB>y))+CNu#R$#HB z@r*3$s03C)SAxsGT~%VDrDFm0?HvDq7T2TA{K&dZb|^AuhuDLeC92=@yphfvRx>+m z9L|ba@OQz_WsU4^my0<uPa1eP-!;U-_Y4{g7Dm><s@}^Q=g3ps5w$MM?kyM>CT{M< ztRd>_hNyuqjp;Jz3it--UC?`2wT8HlI#7)<BmVG?Xo{bRkMMu+tp!hXqWpApqDcDk zFX^<rq*F<sRh}F5alX@0MpHWNX*wl_gI*N)B)_EN>ug28OheI3ahrw5&1)Lobc$;z zp<9X!-m#DML4CG(L);98=`RN82g0*D?)+#V=~ipr&_SorAZT|n;Or{~ac_9sPxdh{ zN8Nrb!}h3$u#|~IQCaDmL8z>#--%V}OI-g)1Kf(K;*n5n^)i4P&pH|UJ!PZWNJA-q zP9py4J_NBm_R}+Sb!}DIiGMukg{pvVMW$!Pc$luDsd&fSEjoCUaG>l1G#79{rEArc ziThSk+k@yxoODiP6TGD3SKqU`2#U)P?n7S&2g)RDg#@Kdm7T`DC;<&$r<98ACkawi zd?1v&>8Jjd^p8XNGYVk`aUZo+mXI-!jFyZ%w1ecdbzj-Q+7=yUqZtDAiXC6~Q;PL< zDRoiNX~kb9Z5a6i)rk~OPL&xPNcl@b+@e4%^=#NuE-NHlCLs?7o}HZP8xbvBCr(CH zbW+*Y_41tY&H2c4##iSf&+dm`wUg7yyT_Avk3E|#!xOJWcB8rCz+dPSM8*3b+-}8g zsIO$Y&5}Q3Ft}q{%;FZW@Dg`$_d1;ZbF5-Itj4OCmsy3?%@V8Oech_?DkP)}sa3}N z5--CV)S|n)#7Y)<sKB<u%38V%8<In2k|w-D6K^4@@VW^(o72*T5eAK;T9A7{c((oE zK@tyT5dQkr^SzzjCfe0z8aL6omL5F)?e5M?_p_Zn_xYx~`)teIc(L#P?)Mun+^y#? z_Mh##&$qXB+^wD6mmB-;-p0$%UOa16ls&-5=O=Ol?&ZfcxEMV1lPK0Lwr+Mun~z#B zy#w`kuG;{O$}7=k8E<B4gKswy&iZjIhQ06!H6RJOagb%%axBNx|AON>e>9f<3wH|X AcmMzZ diff --git a/__pycache__/bed2vcf_2.cpython-39.pyc b/__pycache__/bed2vcf_2.cpython-39.pyc deleted file mode 100644 index e5df67ebe5d1102d81fed96e9dde767ff85e9398..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2901 zcma)8&2QYs6`vV?EXn=QYQ2_LR#I$hpuiTWWav_$P>m{*)e5jziRAh~AXu;*uC(%M zxf)WkBXH4!lUrN(kUu~==AL8!5}pd=q)U(>Hy_&m-f*>+)F226&YSo6=6&-$2Gy#^ za4o<6UVc5#*f)gSyabRx;z=YH$s`{#pKB<5p`qa$fMRGKTfW7aG^F`C^KEHK8?Yox z(gAd&D?LD0mSqLdlT}#*EXz4r2dv0>zbg4Y>ol&QiFG?XxAvoxo!yWVKE22s6VE1| zq>d$J7d(Smmtw**Ixf1t)en+1=m<sJxiuKc=ygZr-e5F7O?&;pF!IYP3S%XEauB9p zF=c`A-S1Wp<Kt*GIei(7J_}aUC`psmM~`}8ETh%_X)=gMy%(F?y^o%3t^PHVYiD7< zx3)6=EH9}jkiC9TBBRJU#w%PAcFT71gA{4oTi8f`fxt=QbC$w~H+&1vCHY`>^V>(S z-A-^Sp`#i5v_9eFykwXB1>@{Q=;#UG-2Bt?jV}DPRWOqAac+l)@gR&q5i5^!-qYR| ziw#k^yM7RjqSs^fxB~PCJnVd7tGltXK8(X)m^@yYF+hfS6c0wZ1=$A?a%2yq(Ls86 zSEhmUWe7^ha>i8y>;DCV-sxaC*iaH*3Y|a-=vkzaNK5^S=RCP9-y0;oDN_xHuhlI4 zn3`F=kP#^qy`chzx-X0+rWIcO7Z>yxNvc#6TzAx|d!XU#nCT%rG_YJ>lvl#FJ8f(V zJy`i0zQ`s<fltiL$jm;u$-9aY*|EsmelLkm@=`ENY2%wJe)T#xK+g@(?pTvMG8#rH z%8^V>D2I&Kxg*psunTno5)BvaXYRLwWGpj|ScY2Zi**<`#ROrJf?;%hWADkfZ)`k$ zj`WHk?U?EVoKzoTflcwN+@{Kx{kvwBlPE<}`aPKZ0L;@*p~ls(fZYlg2_(C9xV8|u zX?TnSHZk!r81yl`6`%fRVrBeDWY!X!*qJRkU<ptFI)F&v#04|~y-Z{!z;aeTGBQVs z7R%f(h_f=K7%c{@nt9TCDbznxGpoMgSSvb3wE~~aoixE=gX3SRmARSILcvTb)3;Nu z*{OoN1nzGOj@g@9YLn#alQTL_q@@c-I+=GV-g32-k_YY*l_hMR)C<n6-ei+`SmM1T zuHQh~JfzJRS|;!902KvxEwxNiRkM&dX=IHHPgXAZ<Q}x#qj*6^6FZ8_eMIa&-~w<& zP*%03FR$%}?iwgUEfhLu9?T+@u_8AY<x4hM%<gB4lB24c5|PH9Dr^C{&^Yw8Afa_5 z;!9ZzTqd}d0A0XENURjKu+B!`!Gbw$!M}9|n>wdWyw@4ixqa~Cwt@Pfh|4VtmO)?E z78Gw=@EnoXo>N7<&sf1#RL0WJ@bjlV`SmR;s)fxra#-k@ORHHeo69V;C=(sMDeLHp z^N0(R4OmGPjUC}fHF-}q<$c@>{{XN4b{D&fX3UjdsJUKIOVVAzcLyW%Q)(KtrS3}G zvOFCR2Vp?;D{2(y%lc&+$##nA5goU0aQM{}Z9k!@KnLFO-|NPBzIcn=4o}nHjxm9w z^A@J;V6140v~Tg`e8m?#Te<P^?*4ft##E+hyMn&*j=S3(1D!Y;=k_7yYNQl-(V5E) zj2cBVKQ{-kfp|{T5_w3CU7L~%73Fpk1u8tujln3*tppCHxt+$tL6Sm79O`yNlwO#e zbv{tPB#|~II(nL4>IeF#Am@k<nqATZUm#`92f0Yjaw9xc5Sj)G!lDuAsc+*9*|JP# zQ0KeRFhw}_?c_}Ncl?R;;{?_N_)e;NT8wXF2dP>$#H{6|QS_<@-M%xO3~RnK9Sv)m zn`VSZ???s|)AT78J<v-BXvCmD1UdOT7A73q6gA-qhvOD>H+ao(Fg85H5fxAygyDU| ztcf}(1#iQwfY-zS44tXb-oU6aYqn!lL`|P?jJn|&4@8~UEus(?R>&l|^cH(i>IU|f z+Jrsl-G>j8_*8|_A71S2@9wqnPt{K2Hh!<Xhns)i+kNJuGTfal7qiI4C~`mf%f?f8 zd*|u%CwuNrcYD{}-rak)@!Z|tc=qwrC!Jbuk5R8dl8doakDx-iSbXE7L9BlR_3v0w zDD;z{t3`hqx*D`mzDILyT66D1s&_iB&iZjIPlwTC`oIzrLjkOqj_JTP+_vL7e`8Dk E13r$1>Hq)$ -- GitLab