Merge branch 'errorrates' into 'master'

Adding Error rates to noisyplotype.py See merge request noisymputer/popsimul!2

Merge branch 'errorrates' into 'master'
Adding Error rates to noisyplotype.py See merge request noisymputer/popsimul!2
bf407962 · alice.boizet_cirad.fr · 6d8b8423 · c0f82cd6 · bf407962
Commit bf407962 authored 1 year ago by alice.boizet_cirad.fr
--- a/noisyplotype.py
+++ b/noisyplotype.py
 import random
-import csv
 import math
 import numpy as np
 

 #def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions, conversion_factor):
-def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor):
+def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, errA, errB):
    '''
    Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth).
    
    Args:
-        chromosome_size (int): The size of the chromosome.
-        marker_density (float): The density of markers.
-        err_rate (float): The error rate of the chromosome.
-        mean_depth (float): The mean depth of the chromosome.
-        max_depth
-        markers_positions
+        mean_depth (float): The mean sequencing depth of the chromosome
+        markers_positions (lst) : liste of marker positions (picked on a grid)
+        conversion_factor (float) : Value of the bp per cM conversion (bp chromosome size / cM chromosome size)
+        errA & errB (float): Respectively the error rate of observing a B whereas the genotype is truely a A and a A whereas the genotype is truely a B
    
    Returns:
-        list: The chromosome.
+        segment: A chromosome with each position having a given genotype with a given site depth and allele depth
+        segment_error : A chromosome with each position having a given genotype with a given site depth and allele depth considering the error rates errA and errB
    '''
-
+    
+    # Create an empty list to store the chromosome
    segment = []
+    # Create an empty list to store the chromosome with error
+    segment_error = []
+    breakpoints = []

-    # Iterate through the list of markers positions
+    # Iterate through the list of markers positions (fixed)
+    previousMarkerGenotype = ""
    for i in range(0, len(markers_positions)):
+        recomb1 = False
+        recomb2 = False
        # If the current marker is not the first one
        if i > 0:
            # Calculate the interval between the current marker and the previous one
-            #TODO 
-            #Kosambi inverse function
+            #using the Kosambi inverse function to estimate the chance of recombination
            IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1]
-            tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor 
+            tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor         ## Conversion factor is used to have the bp per cM 
            IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM)))
            rnd = random.random()
-            # If the random number is greater than the probability of a recombination
-            #print("before",genotype1, genotype2)
-            if rnd > 1 - IntervalWithPreviousMarkerInRF: # a recombination occurs
+
+            # If the random number is greater than the probability of a recombination a recombination occurs
+            # Done for genotype 1
+            if rnd > 1 - IntervalWithPreviousMarkerInRF: 
+                recomb1 = True
                # If the previous marker is A, change the genotype to B
                if genotype1 == "A":
-                    genotype1 = "B";
+                    genotype1 = "B"
                else :
-                    genotype1 = "A";
+                    genotype1 = "A"
+                    
+            # Done for genotype 2
            rnd = random.random()
            if rnd > 1 - IntervalWithPreviousMarkerInRF:
+                recomb2 = True
                # If the previous marker is B, change the genotype to A
                if genotype2 == "A":
-                    genotype2 = "B";
+                    genotype2 = "B"
                else :
-                    genotype2 = "A";
-            #print("after",genotype1, genotype2)
-        # If the current marker is the first one, fix the two first genotypes.
+                    genotype2 = "A"
+                    
+        # If the current marker is the first one, Set the two first genotypes (random).
        else:
            rnd = random.random()
            # If the random number is greater than 0.5
@@ -64,72 +73,157 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor):
                genotype2 = "B"
            else : 
                genotype2 = "A"
+            
+            # initialize previousMarkerGenotype
+            if genotype1 == "A" and genotype2 == "A":
+                previousMarkerGenotype = "A"
+            elif genotype1 == "B" and genotype2 == "B":
+                previousMarkerGenotype = "B"
+            else :
+                previousMarkerGenotype = "H"

+        #Breakpoints
+        if recomb1 or recomb2:
+            #this is a breakpoint [beforeBkpAfter, transition]
+            transition = previousMarkerGenotype + " => "

-        #TODO : ajouter erreur de séquençage
+            if genotype1 == "A" and genotype2 == "A":
+                previousMarkerGenotype = "A"
+            elif genotype1 == "B" and genotype2 == "B":
+                previousMarkerGenotype = "B"
+            else :
+                previousMarkerGenotype = "H"
+            
+            transition = transition + previousMarkerGenotype
        
-        # Calculate the depth of the current marker        
+            breakpoints.append([i,transition]) 
+
+        # Calculate the site depth of the current marker    
        g = np.random.poisson(mean_depth)
        # If the depth of the current marker is less than 0 (safeguard)
        if g < 0:
            g = 0
        # Round the depth of the current marker
        site_depth = round(g)
-        # Initialize x and y
+        # Initialize x and y (with and without error)
        x = 0
        y = 0
+        x_error = 0
+        y_error = 0
+        
        # If the depth of the current marker is 0, genotype is Missing Data
        if site_depth == 0:            
            genotype = "./."
+            genotype_error = "./."
        else :
-            # If the current marker is A and the previous marker is A, genotype if homozygote A (ref, 0/0)
+            # If the site is homozygous A (ref, 0/0)
            if genotype1 == "A" and genotype2 == "A":
                x = site_depth
                y = 0
                genotype = "0/0"
+                x_error = 0
+                y_error = 0
+                # Considering the sequencing and mapping error at each site for a given site depth
+                for j in range(0,site_depth):
+                    rnd = random.random()
+                    # If the random number is smaller the error rate, we increase the wronge allele depth
+                    if rnd < errA:
+                        x_error = x_error + 0
+                        y_error = y_error + 1
+                    else : 
+                        x_error = x_error + 1
+                        y_error = y_error + 0
+                if y_error == site_depth:
+                    genotype_error = "1/1"
+                elif y_error > 0:
+                    genotype_error = "0/1"
+                else :
+                    genotype_error = "0/0"
+
            # If the current marker is B and the previous marker is B, genotype if homozygote B (alt, 1/1)
            elif genotype1 == "B" and genotype2 == "B":
-                x = 1
+                x = 0
                y = site_depth
                genotype = "1/1"
+                x_error = 0
+                y_error = 0
+                # Considering the sequencing and mapping error at each site for a given site depth
+                for j in range(0,site_depth):
+                    rnd = random.random()
+                    # If the random number is smaller the error rate, we increase the wronge allele depth
+                    if rnd < errB:
+                        x_error = x_error + 1
+                        y_error = y_error + 0
+                    else : 
+                        x_error = x_error + 0
+                        y_error = y_error + 1
+                if x_error == site_depth:
+                    genotype_error = "0/0"
+                elif x_error > 0:
+                    genotype_error = "0/1"
+                else :
+                    genotype_error = "1/1"
+                    
            # If the current marker is neither A nor B, genotype if heterozygous H (0/1)
            else:
                # Generate a random number between 0 and the depth of the current marker
                x = random.randint(0,site_depth)
                y = site_depth - x
-                # If the depth of x of the current marker is 0, seen as homozygous
+                # Error of A and B compensate themselves in heterozygous site so no need to change the x_error and y_error
+                x_error = x
+                y_error = y
+                # If the depth of x of the current marker is 0, it's seen as homozygous site (alt, 1/1)
                if x == 0:
                    genotype = "1/1"
-                # If the depth of y of the current marker is not 0, seen as homozygous
+                    genotype_error = "1/1"
+                # If the depth of x of the current marker is 0, it's seen as homozygous site (ref, 0/0)
                elif y == 0:
-                    genotype = "0/0"    
-                # If the depth x and y of the current marker is not 0, seen as heterozygous
+                    genotype = "0/0"
+                    genotype_error = "0/0"
+                # If the depth x and y of the current marker is not 0, it's seen as heterozygous (0/1)
                else :
-                    genotype = "0/1"                
+                    genotype = "0/1"
+                    genotype_error = "0/1"  
+           
        #print("reads",genotype1, genotype2)
        #print(genotype)
        # Append the genotype and the depth of the current marker to the segment list
-        finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:."
+        finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:."        
        segment.append(finalGenotype)
+        
+        # Append the genotype and the depth of the current marker to the segment_error list
+        finalGenotype_error = str(genotype_error) + ":" + str(site_depth) + ":" + str(x_error) + "," + str(y_error) + ":.:.:.:.:."
+        segment_error.append(finalGenotype_error)

-    return segment
-
+    return segment, segment_error, breakpoints

 # Generate a list of individuals
-def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor):
+def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB):
    matrix = []
+    matrix_error = []

    # Calculate the number of marker requiered
    markers_nb = round(chromosome_size * marker_density)
+    
    # Generate a sorted list of markers positions
-    # positions = sorted(random.sample(range(chromosome_size),size))
+    #markers_positions = sorted(random.sample(range(chromosome_size),size))
    markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int")

-    for i in range(nb_individuals):
-        print("individual " + str(i+1))
-        matrix.append(generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor))
+    with open('Breakpoints.csv', 'w') as breakpointFile:
+        breakpointFile.write(",".join(["sample","average_bkp_position", "bkp_start_position", "bkp_stop_position", "transitionType"]) + "\n");
+        for i in range(nb_individuals):
+            print("individual " + str(i+1))
+            result = generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor, errA, errB)
+            matrix.append(result[0])
+            matrix_error.append(result[1])
+            for bkp in result[2]:
+                startBkp = markers_positions[bkp[0] - 1]
+                stopBkp = markers_positions[bkp[0]]
+                averageBkpPosition = startBkp + round((stopBkp - startBkp)/2)
+                bkp_row = [str(i), str(averageBkpPosition), str(startBkp), str(stopBkp), bkp[1]]
+                breakpointFile.write(",".join(bkp_row) + "\n");
    
-    return matrix, markers_positions
+    return matrix, matrix_error, markers_positions

 # # Check if a row is correct
 # def is_correct (row):
@@ -142,20 +236,21 @@ def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_
 #     return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3]

 # Set the parameters for the script
-nb_individuals = 3
+nb_individuals = 100
 chromosome_size = 44000000
 cMsize = 180    ## Size of genetic map
 conversion_factor = chromosome_size/cMsize   ## Corresponds to a bpPercM conversion ! Needs to be fixed... Does not produce the correct division!
-#conversion_factor = 233000
-print(conversion_factor)
-marker_density = 255000/44000000
+marker_density = 0.0055
 mean_depth = 3
-max_depth = 10 #TODO
-err_rate = 0.02
-pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor)
-matrix = pop[0]
-markers_positions = pop[1]    
+max_depth = 6 #TODO (better estimate of max?)
+errA = 0.005
+errB = 0.005

+## Generate the pop and associate the results to the matrixes with and without errors
+pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB)
+matrix = pop[0]
+matrix_error = pop[1]
+markers_positions = pop[2]

 # Header to add to the VCF for it to be recognized as such file.    
 header = [
@@ -180,7 +275,8 @@ header = [
 ]


-with open('Newtest.vcf', 'w') as file:
+## Write the VCF files (with and without errors)
+with open('test.vcf', 'w') as file:
    # Write the header of the file
    file.writelines(header)
    
@@ -194,4 +290,20 @@ with open('Newtest.vcf', 'w') as file:
            row.append(matrix[i][m])
        row.append("0/0:"+ str(mean_depth) + ":" + str(mean_depth) + ",0:.:.:.:.")
        row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.")
+        file.write("\t".join(row) + "\n");
+
+with open('test_error.vcf', 'w') as file:
+    # Write the header of the file
+    file.writelines(header)
+    
+    # Write the header of the file
+    file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + [str(n) for n in range(nb_individuals)] + ["Parent1", "Parent2"]) + "\n")
+    
+    # Iterate over each row in the table
+    for m in range(0, len(matrix_error[0])):
+        row = ["chr01", str(markers_positions[m]), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"]
+        for i in range(0, len(matrix_error)):
+            row.append(matrix_error[i][m])
+        row.append("0/0:"+ str(mean_depth) + ":" + str(mean_depth) + ",0:.:.:.:.")
+        row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.")
        file.write("\t".join(row) + "\n");
\ No newline at end of file