Compare revisions

cecile.triay_ird.fr · alice.boizet_cirad.fr · alice.boizet_cirad.fr · alice.boizet_cirad.fr · Cecile Triay · Cecile Triay
--- a/README.md
+++ b/README.md
+# PopSimul
+
+## Authors
+Cécile Triay (IRD)
+Alice Boizet (CIRAD)
+Mathias Lorieux (IRD)
+Mathieu Triay (Atelier Triay)
+
+## Description
+This tool enables to simulate a VCF of a bi-parental population. It can be used to produce a VCF under controlled conditions with setting parameters such as average depth, marker density and expected error rate. It produces 2 VCF files with the simulated genotypes, one with simulated data, and one with the same simulated data but with simulated genotyping errors. 
+The Breakpoints csv file contains the positions of exact genotypes transitions based on the simulated data. This tool has been used to test NOISYmputer (https://gitlab.cirad.fr/noisymputer/noisymputerstandalone) to compare exact Breakpoints position to Breakpoints positions determined on imputed data.
+
+## Requirements
+python 3
+
+## Running popsimul
+
+``` python popsimul.py -o test1511.vcf -ni 10 -s 44000000 -cM 180 -nm 220000 -a 3 -m 10 -eA 0.03 -eB 0.05```
+
+### Parameters description
+
+You can see all parameters by running this command  
+``` python popsimul.py -h```
+
+| Short  | Long | Description | Default value |
+|---|---|---|---|  
+| -o  | --output | Name for the 3 generated files :  [output].vcf, [output]_errors.vcf, [output]_Breakpoints.csv | popsimul |
+| -ni  | --nb_individuals | number of individuals | 10 |
+| -bp  | --chr_bp | chromosome size | 44000000|
+| -cM | --cM_size | size of genetic map | 180 |
+| -nm  | --nb_markers | number of markers| 220000 |
+| -a  | --average_depth | average depth | 3 |
+| -m  | --max_depth | max depth | 10 |
+| -eA | --error_rate_A | error rate for A | 0.05 |
+| -eB | --error_rate_B | error rate for B | 0.05 |
+
+
--- a/popsimul.py
+++ b/popsimul.py
+import random
+import math
+import numpy as np
+import argparse
+
+#def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions, conversion_factor):
+def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, errA, errB):
+    '''
+    Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth).
+    
+    Args:
+        mean_depth (float): The mean sequencing depth of the chromosome
+        markers_positions (lst) : liste of marker positions (picked on a grid)
+        conversion_factor (float) : Value of the bp per cM conversion (bp chromosome size / cM chromosome size)
+        errA & errB (float): Respectively the error rate of observing a B whereas the genotype is truely a A and a A whereas the genotype is truely a B
+    
+    Returns:
+        segment: An array containing for each snp of the chromosome, this array : [initial_gt, genotype, genotype_error, site_depth, x_error, y_error]        
+        - intial_gt : simulated genotype with no noise and no sequencing error
+        - genotype : simulated genotype with noise 
+        - genotype_error : simulated genotype with noise and sequencing error
+        - site_depth : simulated read depth 
+        - x_error : simulated allelic depth for A
+        - y_error : simulated allelic depth for B
+    '''
+    
+    # Create an empty list to store the chromosome
+    segment = []
+    breakpoints = []
+
+    # Iterate through the list of markers positions (fixed)
+    previous_marker_gt = ""
+    for i in range(0, len(markers_positions)):
+        recomb1 = False
+        recomb2 = False
+        # If the current marker is not the first one
+        if i > 0:
+            # Calculate the interval between the current marker and the previous one
+            #using the Kosambi inverse function to estimate the chance of recombination
+            IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1]
+            tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor         ## Conversion factor is used to have the bp per cM 
+            IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM)))
+            rnd = random.random()
+
+            # If the random number is greater than the probability of a recombination a recombination occurs
+            # Done for genotype 1
+            if rnd > 1 - IntervalWithPreviousMarkerInRF: 
+                recomb1 = True
+                # If the previous marker is A, change the genotype to B
+                if genotype1 == "A":
+                    genotype1 = "B"
+                else :
+                    genotype1 = "A"
+                    
+            # Done for genotype 2
+            rnd = random.random()
+            if rnd > 1 - IntervalWithPreviousMarkerInRF:
+                recomb2 = True
+                # If the previous marker is B, change the genotype to A
+                if genotype2 == "A":
+                    genotype2 = "B"
+                else :
+                    genotype2 = "A"
+                    
+        # If the current marker is the first one, Set the two first genotypes (random).
+        else:
+            rnd = random.random()
+            # If the random number is greater than 0.5
+            if rnd > 0.5:
+                genotype1 = "A"
+            else : 
+                genotype1 = "B"
+
+            rnd = random.random()
+            if rnd > 0.5:
+                genotype2 = "B"
+            else : 
+                genotype2 = "A"
+            
+            # initialize previous_marker_gt
+            if genotype1 == "A" and genotype2 == "A":
+                previous_marker_gt = "A"
+            elif genotype1 == "B" and genotype2 == "B":
+                previous_marker_gt = "B"
+            else :
+                previous_marker_gt = "H"
+
+        # Breakpoints for this individual
+        if recomb1 or recomb2:
+            #this is a breakpoint [beforeBkpAfter, transition]
+            transition = previous_marker_gt + " => "
+
+            if genotype1 == "A" and genotype2 == "A":
+                previous_marker_gt = "A"
+            elif genotype1 == "B" and genotype2 == "B":
+                previous_marker_gt = "B"
+            else :
+                previous_marker_gt = "H"
+            
+            transition = transition + previous_marker_gt
+        
+            breakpoints.append([i,transition]) 
+
+        # Calculate the site depth of the current marker    
+        g = np.random.poisson(mean_depth)
+        # If the depth of the current marker is less than 0 (safeguard)
+        if g < 0:
+            g = 0
+        # Round the depth of the current marker
+        site_depth = round(g)
+        # Initialize x and y (with and without error)
+        x = 0
+        y = 0
+        x_error = 0
+        y_error = 0
+
+        # If the site is homozygous A (ref, 0/0)
+        if genotype1 == "A" and genotype2 == "A":
+            x = site_depth
+            y = 0
+            initial_gt  = "0/0"
+            genotype = "0/0"
+            x_error = 0
+            y_error = 0
+            # Considering the sequencing and mapping error at each site for a given site depth
+            for j in range(0, site_depth):
+                rnd = random.random()
+                # If the random number is smaller the error rate, we increase the wronge allele depth
+                if rnd < errA:
+                    x_error = x_error + 0
+                    y_error = y_error + 1
+                else : 
+                    x_error = x_error + 1
+                    y_error = y_error + 0
+            if y_error == site_depth:
+                genotype_error = "1/1"
+            elif y_error > 0:
+                genotype_error = "0/1"
+            else :
+                genotype_error = "0/0"
+
+        # If the current marker is B and the previous marker is B, genotype if homozygote B (alt, 1/1)
+        elif genotype1 == "B" and genotype2 == "B":
+            x = 0
+            y = site_depth
+            initial_gt  = "1/1"
+            genotype = "1/1"
+            x_error = 0
+            y_error = 0
+            # Considering the sequencing and mapping error at each site for a given site depth
+            for j in range(0,site_depth):
+                rnd = random.random()
+                # If the random number is smaller the error rate, we increase the wronge allele depth
+                if rnd < errB:
+                    x_error = x_error + 1
+                    y_error = y_error + 0
+                else : 
+                    x_error = x_error + 0
+                    y_error = y_error + 1
+            if x_error == site_depth:
+                genotype_error = "0/0"
+            elif x_error > 0:
+                genotype_error = "0/1"
+            else :
+                genotype_error = "1/1"
+                
+        # If the current marker is neither A nor B, genotype is heterozygous H (0/1)
+        else:
+            initial_gt  = "0/1"
+            # Generate a random number between 0 and the depth of the current marker
+            x = random.randint(0,site_depth)
+            y = site_depth - x
+            # Error of A and B compensate themselves in heterozygous site so no need to change the x_error and y_error
+            x_error = x
+            y_error = y
+            # If the depth of x of the current marker is 0, it's seen as homozygous site (alt, 1/1)
+            if x == 0:
+                genotype = "1/1"
+                genotype_error = "1/1"
+            # If the depth of x of the current marker is 0, it's seen as homozygous site (ref, 0/0)
+            elif y == 0:
+                genotype = "0/0"
+                genotype_error = "0/0"
+            # If the depth x and y of the current marker is not 0, it's seen as heterozygous (0/1)
+            else :
+                genotype = "0/1"
+                genotype_error = "0/1"  
+
+        if site_depth == 0:
+            genotype = "./."
+            genotype_error = "./."
+        
+        # data is an array that stores genotypes and depth for one snp and one individual
+        data = [initial_gt , genotype, genotype_error, site_depth, x_error, y_error]
+
+        segment.append(data)
+
+    return segment, breakpoints
+
+# Generate a list of individuals
+def generate_individuals (nb_individuals, chromosome_size, nb_markers, mean_depth, conversion_factor, errA, errB):
+    matrix = [] #matrix is an array of genotypes for each individual
+    #matrix_error = []
+    
+    # Generate a sorted list of markers positions
+    #markers_positions = sorted(random.sample(range(chromosome_size),size))
+    markers_positions = np.linspace(1, chromosome_size, nb_markers, dtype="int")
+
+    with open(args.output + '_Breakpoints.csv', 'w') as breakpointFile:
+        breakpointFile.write(",".join(["sample","average_bkp_position", "bkp_start_position", "bkp_stop_position", "transitionType"]) + "\n");
+        for i in range(nb_individuals):
+            print("individual " + str(i+1) + " / " + str(nb_individuals), end="\r")
+            bkps_nb = 0
+            result = []
+
+            while bkps_nb < 1 : #resimulate until we get at least one crossing over for this individual
+                result = generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor, errA, errB)
+                bkps_nb = len(result[1])
+
+            matrix.append(result[0])
+
+            for bkp in result[1]:
+                startBkp = markers_positions[bkp[0] - 1]
+                stopBkp = markers_positions[bkp[0]]
+                averageBkpPosition = startBkp + round((stopBkp - startBkp)/2)
+                bkp_row = [str(i), str(averageBkpPosition), str(startBkp), str(stopBkp), bkp[1]]
+                breakpointFile.write(",".join(bkp_row) + "\n");
+    print("The file", args.output + "_Breakpoint.csv" , "has been generated")
+    
+    return matrix, markers_positions
+
+# Write VCF file
+def write_vcf(output_file_name, matrix, gt_data_position, with_depth):
+    # Header to add to the VCF for it to be recognized as such file.    
+    header = [
+    "##fileformat=VCFv4.1\n",
+    "##fileDate=20090805\n",
+    "##source=myImputationProgramV3.1\n",
+    "##reference=/shared/projects/recombinationlandscape/REFERENCE_GENOME/Osat_Azucena_AGI_chrOK_uniline_WithNIPBARorganelles.fasta\n",
+    "##contig=<ID=chr01,length=44011168>\n",
+    "##phasing=partial\n",
+    "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
+    "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n",
+    "##FORMAT=<ID=AD,Number=2,Type=Integer,Description=\"Allelic Depth\">\n"
+    ]
+    nb_individuals = len(matrix)
+    nb_snp = len(matrix[0])
+
+    with open(output_file_name + '.vcf', 'w') as file:
+        # Write the header of the file
+        file.writelines(header)
+        
+        # Write the header of the file
+        file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + [str(n) for n in range(nb_individuals)] + ["Parent1", "Parent2"]) + "\n")
+        
+        # Iterate over each row in the table
+        for m in range(0, nb_snp):            
+            if with_depth:
+                format_info = "GT:DP:AD"
+            else :
+                format_info = "GT"
+            row = ["chr01", str(markers_positions[m]), ".", "A", "T", ".", ".", ".", format_info]
+
+            # add all individual genotypes
+            for i in range(0, nb_individuals):
+                vcf_data = str(matrix[i][m][gt_data_position])
+                if with_depth:
+                    vcf_data = vcf_data + ":" + str(matrix[i][m][3]) + ":" + str(matrix[i][m][4]) + "," + str(matrix[i][m][5])
+                row.append(vcf_data)
+            
+            # add parent genotypes
+            if with_depth:
+                p1_data =  "0/0:"+ str(round(mean_depth)) + ":" + str(round(mean_depth)) + ",0"
+                p2_data =  "1/1:"+ str(round(mean_depth)) + ":0," + str(round(mean_depth))
+            else :
+                p1_data =  "0/0"
+                p2_data =  "1/1"
+
+            row.append(p1_data)
+            row.append(p2_data)
+            file.write("\t".join(row) + "\n")
+
+
+# # Check if a row is correct
+# def is_correct (row):
+#     l = len(row)
+
+#     count_h = row.count("H") / l
+#     count_a = row.count("A") / l
+#     count_b = row.count("B") / l
+    
+#     return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3]
+
+
+# define commandline arguments
+parser = argparse.ArgumentParser(description='PopSimul')
+
+parser.add_argument('-o', '--output', help='Output files name', default="popsimul")
+parser.add_argument('-ni', '--nb_individuals', help='Number of individuals', default=10)
+parser.add_argument('-bp', '--chr_bp', help='Chromosome size in bp', default=44000000)
+parser.add_argument('-cM', '--cM_size', help='Size of genetic map', default=180)
+parser.add_argument('-nm', '--nb_markers', help='Number of markers', default=220000)
+parser.add_argument('-a', '--average_depth', help='Depth average', default=3)
+parser.add_argument('-m', '--max_depth', help='Maximum possible depth', default=10)
+parser.add_argument('-eA', '--error_rate_A', help='Error rate', default=0.05)
+parser.add_argument('-eB', '--error_rate_B', help='Error rate', default=0.05)
+
+args = parser.parse_args()
+
+# Set the parameters for the script
+nb_individuals = int(args.nb_individuals)
+chromosome_size = int(args.chr_bp)
+cMsize = int(args.cM_size)    ## Size of genetic map
+conversion_factor = chromosome_size/cMsize   ## Corresponds to a bpPercM conversion ! Needs to be fixed... Does not produce the correct division!
+#conversion_factor = 233000
+#marker_density = 255000/44000000
+nb_markers = int(args.nb_markers)
+mean_depth = float(args.average_depth)
+max_depth = float(args.max_depth) #TODO better estimate for max ?
+errA = float(args.error_rate_A)
+errB = float(args.error_rate_B)
+
+print("-----USED PARAMETERS-----")
+print("nb_individuals =",str(nb_individuals))
+print("chr_bp =",str(chromosome_size))
+print("cMsize =",str(cMsize))
+print("nb_markers =",str(nb_markers))
+print("average_depth =",str(mean_depth))
+print("max_depth =",str(max_depth))
+print("error_rate_A =",str(errA))
+print("error_rate_B =",str(errB))
+print("------------------------")
+
+## Generate the pop and associate the results to the matrixes with and without errors
+pop = generate_individuals(nb_individuals, chromosome_size, nb_markers, mean_depth, conversion_factor, errA, errB)
+matrix = pop[0]
+markers_positions = pop[1]
+
+# Header to add to the VCF for it to be recognized as such file.    
+header = [
+"##fileformat=VCFv4.1\n",
+"##fileDate=20090805\n",
+"##source=myImputationProgramV3.1\n",
+"##reference=/shared/projects/recombinationlandscape/REFERENCE_GENOME/Osat_Azucena_AGI_chrOK_uniline_WithNIPBARorganelles.fasta\n",
+"##contig=<ID=chr01,length=44011168>\n",
+"##phasing=partial\n",
+"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
+"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n",
+"##FORMAT=<ID=AD,Number=2,Type=Integer,Description=\"Allelic Depth\">\n"
+]
+
+## Write simulated VCF file with no noise and no sequencing error
+write_vcf(args.output, matrix, 0, True)
+print("The file", args.output + ".vcf has been generated")
+
+## Write simulated VCF file with noise but no sequencing error
+filename = args.output + "_noise"
+write_vcf(filename, matrix, 1, True)
+print("The file", filename + ".vcf has been generated")
+
+## Write simulated VCF file with noise and sequencing error
+filename = args.output + "_noise_errors"
+write_vcf(filename, matrix, 2, True)
+print("The file", filename + ".vcf has been generated")
--- a/sumulation.vba
+++ b/sumulation.vba
No results found