Merge branch 'comments' into 'master'

Comments See merge request noisymputer/popsimul!1

Merge branch 'comments' into 'master'
Comments See merge request noisymputer/popsimul!1
ea5a11f7 · alice.boizet_cirad.fr · 5ff25ba1 · e9b3ba36 · ea5a11f7
Commit ea5a11f7 authored 1 year ago by alice.boizet_cirad.fr
--- a/noisyplotype.py
+++ b/noisyplotype.py
 import random
 import csv
 import math
+import numpy as np
+ 

+def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions):

-# '''Generate a random segment from a given genotype.
-# Args:
-    # genotype (str): The genotype to generate a segment from.
-    # size (int): The length of segments to generate (number of sites).
-    # err_rate (float): The probability at a site that the segment's genotype will be replaced by a random other genotype.
-# Returns:
-    # str: A random segment from the given genotype.
-# '''
-def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, sd_depth):
-
-    # Create a list of all possible genotypes
-    #genotypes = ["A", "B", "H", "H"]
+    '''
+    Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth).
+    
+    Args:
+        chromosome_size (int): The size of the chromosome.
+        marker_density (float): The density of markers.
+        err_rate (float): The error rate of the chromosome.
+        mean_depth (float): The mean depth of the chromosome.
+        max_depth
+        markers_positions
    
-    # Create a list of all other genotypes that are not the given genotype
-    #other_genotypes = [g for g in genotypes if g!= genotype]
+    Returns:
+        list: The chromosome.
+    '''
+
    segment = []

-    # random marker positions
-    size = round(chromosome_size * marker_density)
-    positions = sorted(random.sample(range(chromosome_size),size)) #sorted list of markers positions
-    for i in range(0, size):
+    # Iterate through the list of markers positions
+    for i in range(0, len(markers_positions)):
+        # If the current marker is not the first one
        if i > 0:
+            # Calculate the interval between the current marker and the previous one
            #TODO 
            #Kosambi inverse function
-            IntervalWithPreviousMarker = positions[i] - positions[i-1]
-            temp = 2 * (IntervalWithPreviousMarker / 100)
-            IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(temp) - math.exp(-temp)) / (math.exp(temp) + math.exp(-temp)))
+            IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1]
+            tempcM = 2 * (IntervalWithPreviousMarker / 100) / len(markers_positions) # markers_nb (= len(markers_positions)) corresponds to bpPercM
+            IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM)))
            rnd = random.random()
+            # If the random number is greater than the probability of a recombination
+            #print("before",genotype1, genotype2)
            if rnd > 1 - IntervalWithPreviousMarkerInRF: # a recombination occurs
+                # If the previous marker is A, change the genotype to B
                if genotype1 == "A":
                    genotype1 = "B";
                else :
                    genotype1 = "A";
+            rnd = random.random()
+            if rnd > 1 - IntervalWithPreviousMarkerInRF:
+                # If the previous marker is B, change the genotype to A
                if genotype2 == "A":
                    genotype2 = "B";
                else :
-                    genotype1 = "A";
-
+                    genotype2 = "A";
+            #print("after",genotype1, genotype2)
+        # If the current marker is the first one, fix the two first genotypes.
        else:
-            # first marker
            rnd = random.random()
+            # If the random number is greater than 0.5
            if rnd > 0.5:
                genotype1 = "A"
+            else : 
+                genotype1 = "B"
+
+            rnd = random.random()
+            if rnd > 0.5:
                genotype2 = "B"
            else : 
                genotype2 = "A"
-                genotype1 = "B"

-        #TODO : ajouter erreur de séquençage

-        # sd : standard deviation
-        g = random.gauss(mean_depth, sd_depth)
+        #TODO : ajouter erreur de séquençage
+        
+        # Calculate the depth of the current marker        
+        g = np.random.poisson(mean_depth)
+        # If the depth of the current marker is less than 0 (safeguard)
        if g < 0:
            g = 0
+        # Round the depth of the current marker
        site_depth = round(g)
+        # Initialize x and y
        x = 0
        y = 0
+        # If the depth of the current marker is 0, genotype is Missing Data
        if site_depth == 0:            
            genotype = "./."
        else :
+            # If the current marker is A and the previous marker is A, genotype if homozygote A (ref, 0/0)
            if genotype1 == "A" and genotype2 == "A":
                x = site_depth
                y = 0
                genotype = "0/0"
+            # If the current marker is B and the previous marker is B, genotype if homozygote B (alt, 1/1)
            elif genotype1 == "B" and genotype2 == "B":
                x = 1
                y = site_depth
                genotype = "1/1"
+            # If the current marker is neither A nor B, genotype if heterozygous H (0/1)
            else:
-                #x = random.gauss(site_depth, sd2)
+                # Generate a random number between 0 and the depth of the current marker
                x = random.randint(0,site_depth)
                y = site_depth - x
+                # If the depth of x of the current marker is 0, seen as homozygous
                if x == 0:
                    genotype = "1/1"
+                # If the depth of y of the current marker is not 0, seen as homozygous
                elif y == 0:
                    genotype = "0/0"    
+                # If the depth x and y of the current marker is not 0, seen as heterozygous
                else :
                    genotype = "0/1"                
-
-        
-        finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:."
+        #print("reads",genotype1, genotype2)
+        #print(genotype)
+        # Append the genotype and the depth of the current marker to the segment list
+        finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:.:."
        segment.append(finalGenotype)

-    # Return the random segment
    return segment

+
 # Generate a list of individuals
-def generate_individuals (nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, sd_depth):
+def generate_individuals (nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, max_depth):
    matrix = []
+
+    # Calculate the number of marker requiered
+    markers_nb = round(chromosome_size * marker_density)
+    # Generate a sorted list of markers positions
+    # positions = sorted(random.sample(range(chromosome_size),size))
+    markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int")
+
    for i in range(nb_individuals):
        print("individual " + str(i+1))
-        matrix.append(generate_chr_for_one_ind(chromosome_size, marker_density, err_rate, mean_depth, sd_depth))
+        matrix.append(generate_chr_for_one_ind(chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions))
    
-    return matrix
+    return matrix, markers_positions

-# Check if a row is correct
-def is_correct (row):
-    l = len(row)
+# # Check if a row is correct
+# def is_correct (row):
+#     l = len(row)

-    count_h = row.count("H") / l
-    count_a = row.count("A") / l
-    count_b = row.count("B") / l
+#     count_h = row.count("H") / l
+#     count_a = row.count("A") / l
+#     count_b = row.count("B") / l
    
-    return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3]
+#     return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3]

-chromosome_size = 100000
-nb_individuals = 100
+# Set the parameters for the script
+nb_individuals = 3
 err_rate = 0.02
 chromosome_size = 44000000
 marker_density = 255000/44000000
 mean_depth = 3
-sd_depth = 1
-pop = generate_individuals(nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, sd_depth)
-
-# # Create an empty list to store the individuals
-# rows = []
-
-# # Loop through the population
-# for i in range(chromosome_size):
-#     # Create an empty list to store the individual
-#     row = []
-    
-#     # Loop through the individuals in the population
-#     for ind in pop:
-#         # Append the individual's chromosome to the row
-#         row.append(ind[i])
-        
-#     # Append the row to the list of rows
-#     #row.append(is_correct(row))
-    
-#     rows.append(row)
-    
+max_depth = 10 #TODO
+pop = generate_individuals(nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, max_depth)
+matrix = pop[0]
+markers_positions = pop[1]
    
+# Header to add to the VCF for it to be recognized as such file.    
 header = [
 "##fileformat=VCFv4.1\n",
 "##fileDate=20090805\n",
@@ -157,9 +174,8 @@ header = [
 "##FORMAT=<ID=HQ,Number=2,Type=Integer,Description=\"segment Quality\">\n"
 ]

-#with open('test.vcf', 'w') as file:

-with open('test.tsv', 'w') as file:
+with open('test.vcf', 'w') as file:
    # Write the header of the file
    file.writelines(header)
    
@@ -167,26 +183,10 @@ with open('test.tsv', 'w') as file:
    file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + [str(n) for n in range(nb_individuals)] + ["Parent1", "Parent2"] + ["Test"]) + "\n")
    
    # Iterate over each row in the table
-    #for i, row in enumerate(rows):
-    for m in range(0, len(pop[0])):
-        row = ["chr01", str(m + 1), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"]
-        for i in range(0, len(pop)):
-            row.append(pop[i][m])
+    for m in range(0, len(matrix[0])):
+        row = ["chr01", str(markers_positions[m]), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"]
+        for i in range(0, len(matrix)):
+            row.append(matrix[i][m])
        row.append("0/0:"+ str(mean_depth) + ":" + str(mean_depth) + ",0:.:.:.:.:.")
        row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.:.")
-        file.write("\t".join(row) + "\n");
-
-        # # Create an empty list to store the converted row
-        # converted_row = []
-        
-        # # Iterate over each genotype in the row
-        # for genotype in ind:
-        #     converted_row.append(genotype + ":.:.:.:.:.")
-
-        # file.write("\t".join(["chr01", str(i + 1), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"] + converted_row + ["0/0:"+ mean_depth + ":" + mean_depth + ",0:.:.:.:.:.", "1/1:"+ mean_depth + ":0," + mean_depth + ":.:.:.:.:."]) + "\n" )
-        
-        
-        
-        ##import random; random.sample(range(101), 10)  ##10 random values in a range of 100 without duplicate.
-        
-        
\ No newline at end of file
+        file.write("\t".join(row) + "\n");
\ No newline at end of file