Update comments and delete unused section of script.

845547da · Cecile Triay · c10927e4 · 845547da
Commit 845547da authored 1 year ago by Cecile Triay
--- a/noisyplotype.py
+++ b/noisyplotype.py
@@ -9,36 +9,41 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
    Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth).
    
    Args:
-        mean_depth (float): The mean depth of the chromosome.
+        mean_depth (float): The mean sequencing depth of the chromosome
        markers_positions (lst) : liste of marker positions (picked on a grid)
-        err_rate (float): The error rate of the chromosome.
+        conversion_factor (float) : Value of the bp per cM conversion (bp chromosome size / cM chromosome size)
+        errA & errB (float): Respectively the error rate of observing a B whereas the genotype is truely a A and a A whereas the genotype is truely a B
    
    Returns:
-        list: The chromosome with each position having a given genotype.
+        segment: A chromosome with each position having a given genotype with a given site depth and allele depth
+        segment_error : A chromosome with each position having a given genotype with a given site depth and allele depth considering the error rates errA and errB
    '''
-
+    
+    # Create an empty list to store the chromosome
    segment = []
+    # Create an empty list to store the chromosome with error
    segment_error = []
-
-    # Iterate through the list of markers positions
+    
+    # Iterate through the list of markers positions (fixed)
    for i in range(0, len(markers_positions)):
        # If the current marker is not the first one
        if i > 0:
            # Calculate the interval between the current marker and the previous one
-            #TODO 
-            #Kosambi inverse function
+            #using the Kosambi inverse function to estimate the chance of recombination
            IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1]
-            tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor 
+            tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor         ## Conversion factor is used to have the bp per cM 
            IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM)))
            rnd = random.random()
-            # If the random number is greater than the probability of a recombination
-            #print("before",genotype1, genotype2)
-            if rnd > 1 - IntervalWithPreviousMarkerInRF: # a recombination occurs
+            # If the random number is greater than the probability of a recombination a recombination occurs
+            # Done for genotype 1
+            if rnd > 1 - IntervalWithPreviousMarkerInRF: 
                # If the previous marker is A, change the genotype to B
                if genotype1 == "A":
                    genotype1 = "B";
                else :
                    genotype1 = "A";
+                    
+            # Done for genotype 2
            rnd = random.random()
            if rnd > 1 - IntervalWithPreviousMarkerInRF:
                # If the previous marker is B, change the genotype to A
@@ -46,8 +51,8 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
                    genotype2 = "B";
                else :
                    genotype2 = "A";
-            #print("after",genotype1, genotype2)
-        # If the current marker is the first one, fix the two first genotypes.
+                    
+        # If the current marker is the first one, Set the two first genotypes (random).
        else:
            rnd = random.random()
            # If the random number is greater than 0.5
@@ -61,37 +66,36 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
                genotype2 = "B"
            else : 
                genotype2 = "A"
-
-
-        #TODO : ajouter erreur de séquençage
        
-        # Calculate the depth of the current marker        
+        # Calculate the site depth of the current marker    
        g = np.random.poisson(mean_depth)
        # If the depth of the current marker is less than 0 (safeguard)
        if g < 0:
            g = 0
        # Round the depth of the current marker
        site_depth = round(g)
-        # Initialize x and y
+        # Initialize x and y (with and without error)
        x = 0
        y = 0
        x_error = 0
        y_error = 0
+        
        # If the depth of the current marker is 0, genotype is Missing Data
        if site_depth == 0:            
            genotype = "./."
            genotype_error = "./."
        else :
-            # If the current marker is A and the previous marker is A, genotype if homozygote A (ref, 0/0)
+            # If the site is homozygous A (ref, 0/0)
            if genotype1 == "A" and genotype2 == "A":
                x = site_depth
                y = 0
                genotype = "0/0"
                x_error = 0
                y_error = 0
+                # Considering the sequencing and mapping error at each site for a given site depth
                for j in range(0,site_depth):
                    rnd = random.random()
-                    # If the random number is greater than errB, we switch increase the wronge allele depth
+                    # If the random number is smaller the error rate, we increase the wronge allele depth
                    if rnd < errA:
                        x_error = x_error + 0
                        y_error = y_error + 1
@@ -112,9 +116,10 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
                genotype = "1/1"
                x_error = 0
                y_error = 0
+                # Considering the sequencing and mapping error at each site for a given site depth
                for j in range(0,site_depth):
                    rnd = random.random()
-                    # If the random number is greater than errB, we switch increase the wronge allele depth
+                    # If the random number is smaller the error rate, we increase the wronge allele depth
                    if rnd < errB:
                        x_error = x_error + 1
                        y_error = y_error + 0
@@ -127,22 +132,24 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
                    genotype_error = "0/1"
                else :
                    genotype_error = "1/1"
+                    
            # If the current marker is neither A nor B, genotype if heterozygous H (0/1)
            else:
                # Generate a random number between 0 and the depth of the current marker
                x = random.randint(0,site_depth)
                y = site_depth - x
+                # Error of A and B compensate themselves in heterozygous site so no need to change the x_error and y_error
                x_error = x
                y_error = y
-                # If the depth of x of the current marker is 0, seen as homozygous
+                # If the depth of x of the current marker is 0, it's seen as homozygous site (alt, 1/1)
                if x == 0:
                    genotype = "1/1"
                    genotype_error = "1/1"
-                # If the depth of y of the current marker is not 0, seen as homozygous
+                # If the depth of x of the current marker is 0, it's seen as homozygous site (ref, 0/0)
                elif y == 0:
                    genotype = "0/0"
                    genotype_error = "0/0"
-                # If the depth x and y of the current marker is not 0, seen as heterozygous
+                # If the depth x and y of the current marker is not 0, it's seen as heterozygous (0/1)
                else :
                    genotype = "0/1"
                    genotype_error = "0/1"                
@@ -152,6 +159,7 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
        finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:."        
        segment.append(finalGenotype)
        
+        # Append the genotype and the depth of the current marker to the segment_error list
        finalGenotype_error = str(genotype_error) + ":" + str(site_depth) + ":" + str(x_error) + "," + str(y_error) + ":.:.:.:.:."
        segment_error.append(finalGenotype_error)

@@ -164,6 +172,7 @@ def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_

    # Calculate the number of marker requiered
    markers_nb = round(chromosome_size * marker_density)
+    
    # Generate a sorted list of markers positions
    #markers_positions = sorted(random.sample(range(chromosome_size),size))
    markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int")
@@ -188,17 +197,16 @@ def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_

 # Set the parameters for the script
 nb_individuals = 100
-chromosome_size = 44000000
+chromosome_size = 44000000  ## Size of chromosome in bp
 cMsize = 180    ## Size of genetic map
 conversion_factor = chromosome_size/cMsize   ## Corresponds to a bpPercM conversion ! Needs to be fixed... Does not produce the correct division!
-#print(conversion_factor)
 marker_density = 0.0055
 mean_depth = 3
-max_depth = 6 #TODO
+max_depth = 6 #TODO (better estimate of max?)
 errA = 0.005
 errB = 0.005

-# #size = 200000
+## Generate the pop and associate the results to the matrixes with and without errors
 pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB)
 matrix = pop[0]
 matrix_error = pop[1]
@@ -227,7 +235,8 @@ header = [
 ]


-with open('ABtest.vcf', 'w') as file:
+## Write the VCF files (with and without errors)
+with open('test.vcf', 'w') as file:
    # Write the header of the file
    file.writelines(header)
    
@@ -243,7 +252,7 @@ with open('ABtest.vcf', 'w') as file:
        row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.")
        file.write("\t".join(row) + "\n");

-with open('ABtest_error.vcf', 'w') as file:
+with open('test_error.vcf', 'w') as file:
    # Write the header of the file
    file.writelines(header)