fusion of simulation.vba and cecile method

108f7668 · alice.boizet_cirad.fr · ddade1d2 · 108f7668
Commit 108f7668 authored 1 year ago by alice.boizet_cirad.fr
--- a/noisyplotype.py
+++ b/noisyplotype.py
 import random
 import csv
+import math


 # '''Generate a random segment from a given genotype.
@@ -10,67 +11,90 @@ import csv
 # Returns:
    # str: A random segment from the given genotype.
 # '''
-def generate_segment (genotype, size, err_rate):
+def generate_chr (chromosome_size, marker_density, err_rate, mean_depth, sd_depth):

    # Create a list of all possible genotypes
-    genotypes = ["A", "B", "H", "H"]
+    #genotypes = ["A", "B", "H", "H"]
    
    # Create a list of all other genotypes that are not the given genotype
-    other_genotypes = [g for g in genotypes if g!= genotype]
+    #other_genotypes = [g for g in genotypes if g!= genotype]
    segment = []

-    # Generate a random segment from the given genotype
-    for i in range(size):
-        # If the probability of replacing a segment is less than or equal to the error rate
-        if random.random() <= err_rate:
-            # Replace the segment with a random other genotype
-            segment.append(random.choice(other_genotypes))
+    # random marker positions
+    size = round(chromosome_size * marker_density)
+    print(size)
+    print(chromosome_size)
+    print(marker_density)
+    positions = sorted(random.sample(range(chromosome_size),size)) #sorted list of markers positions
+
+    for i in range(0, size):
+        if i > 0:
+            #TODO 
+            #Kosambi inverse function
+            IntervalWithPreviousMarker = positions[i] - positions[i-1]
+            temp = 2 * (IntervalWithPreviousMarker / 100)
+            IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(temp) - math.exp(-temp)) / (math.exp(temp) + math.exp(-temp)))
+            rnd = random.random()
+            if rnd > 1 - IntervalWithPreviousMarkerInRF: # a recombination occurs
+                if genotype1 == "A":
+                    genotype1 = "B";
+                else :
+                    genotype1 = "A";
+                if genotype2 == "A":
+                    genotype2 = "B";
+                else :
+                    genotype1 = "A";
+
        else:
-            # Otherwise, append the given genotype to the segment
-            segment.append(genotype)
+            # first marker
+            rnd = random.random()
+            if rnd > 0.5:
+                genotype1 = "A"
+                genotype2 = "B"
+            else : 
+                genotype2 = "A"
+                genotype1 = "B"
+
+        #TODO : ajouter erreur de séquençage
+
+        # sd : standard deviation
+        site_depth = round(random.gauss(mean_depth, sd_depth))
+        x = 0
+        y = 0
+        if site_depth == 0:            
+            genotype = "./."
+        else :
+            if genotype1 == "A" and genotype2 == "A":
+                x = site_depth
+                y = 0
+                genotype = "0/0"
+            elif genotype1 == "B" and genotype2 == "B":
+                x = 1
+                y = site_depth
+                genotype = "1/1"
+            else:
+                #x = random.gauss(site_depth, sd2)
+                print(site_depth)
+                x = round(random.random(range(0,site_depth)))
+                y = site_depth - x
+                if x == 0:
+                    genotype = "1/1"
+                elif y == 0:
+                    genotype = "0/0"    
+                else :
+                    genotype = "0/1"                
+
+        
+        finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y)
+        segment.append(finalGenotype)

    # Return the random segment
    return "".join(segment)

-# Generate a chromosome with a given total size and error rate
-def generate_chromosome (total_size, err_rate):
-    min_transition = 1
-    max_transition = 2
-    min_segment_size = 1000
-
-    nb_transition = random.randint(min_transition, max_transition)
-
-    # Generate a list of probabilities for each transition
-    genotype_probability = ["A", "B", "H", "H"]
-    chromosome = []
-    genotypes = []
-    current_size = 0
-
-    for i in range(nb_transition + 1): # nb of segments is nb transition + 1
-        if i == nb_transition:
-            segment_size = total_size - current_size
-        else:
-            segment_size = random.randint(min_segment_size, (total_size - current_size) - ((nb_transition - i) * min_segment_size))
-       
-        # Generate a genotype with the given probability
-        if len(genotypes) > 0:
-            genotype = "H" if (genotypes[-1] == "A" or genotypes[-1] == "B") else random.choice(["A", "B"])  #random.choice([g for g in genotype_probability if g!= genotypes[-1]])
-        else:
-            genotype = random.choice(genotype_probability) # twice as many chances to be H
-            
-        # Generate a segment with the given genotype and size
-        segment = generate_segment(genotype, segment_size, err_rate)
-        chromosome.append(segment)
-                
-        genotypes.append(genotype)
-        current_size += segment_size
-    
-    # Join the chromosome with a '-'
-    return "".join(chromosome)
-
 # Generate a list of individuals
-def generate_individuals (nb_individuals, chromosome_size, err_rate):
-    return [generate_chromosome(chromosome_size, err_rate) for i in range(nb_individuals)]
+def generate_individuals (nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, sd_depth):
+    
+    return [generate_chr(chromosome_size, marker_density, err_rate, mean_depth, sd_depth) for i in range(nb_individuals)]

 # Check if a row is correct
 def is_correct (row):
@@ -85,7 +109,11 @@ def is_correct (row):
 chromosome_size = 100000
 nb_individuals = 100
 err_rate = 0.02
-pop = generate_individuals(nb_individuals, chromosome_size, err_rate)
+chromosome_size = 44000000
+marker_density = 255000/44000000
+mean_depth = 3
+sd_depth = 1
+pop = generate_individuals(nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, sd_depth)

 # Create an empty list to store the individuals
 rows = []
@@ -142,22 +170,10 @@ with open('test.tsv', 'w') as file:
        
        # Iterate over each genotype in the row
        for genotype in row:
-            # Set the genotype to '.' if it is not present
-            g = "."
-            
-            # If the genotype is 'H', set the genotype to 0/1
-            if genotype == "H":
-                g = "0/1"
-            # If the genotype is 'A', set the genotype to 0/0
-            elif genotype == "A":
-                g = "0/0"
-            # If the genotype is 'B', set the genotype to 1/1
-            elif genotype == "B":
-                g = "1/1"
-            
-            converted_row.append(g + ":.:.:.:.:.:.:.")
-
-        file.write("\t".join(["chr01", str(i + 1), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"] + converted_row + ["0/0:.:.:.:.:.:.:.", "1/1:.:.:.:.:.:.:."]) + "\n" )
+
+            converted_row.append(genotype + ":.:.:.:.:.")
+
+        file.write("\t".join(["chr01", str(i + 1), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"] + converted_row + ["0/0:"+ mean_depth + ":" + mean_depth + ",0:.:.:.:.:.", "1/1:"+ mean_depth + ":0," + mean_depth + ":.:.:.:.:."]) + "\n" )