diff --git a/noisyplotype.py b/noisyplotype.py index e363a1d847cefd8ff0bdcd10bcc3bed82db596a9..a6bb220c134d9524859347eee6d046753891abbf 100644 --- a/noisyplotype.py +++ b/noisyplotype.py @@ -1,141 +1,158 @@ import random import csv import math +import numpy as np + +def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions): -# '''Generate a random segment from a given genotype. -# Args: - # genotype (str): The genotype to generate a segment from. - # size (int): The length of segments to generate (number of sites). - # err_rate (float): The probability at a site that the segment's genotype will be replaced by a random other genotype. -# Returns: - # str: A random segment from the given genotype. -# ''' -def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, sd_depth): - - # Create a list of all possible genotypes - #genotypes = ["A", "B", "H", "H"] + ''' + Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth). + + Args: + chromosome_size (int): The size of the chromosome. + marker_density (float): The density of markers. + err_rate (float): The error rate of the chromosome. + mean_depth (float): The mean depth of the chromosome. + max_depth + markers_positions - # Create a list of all other genotypes that are not the given genotype - #other_genotypes = [g for g in genotypes if g!= genotype] + Returns: + list: The chromosome. + ''' + segment = [] - # random marker positions - size = round(chromosome_size * marker_density) - positions = sorted(random.sample(range(chromosome_size),size)) #sorted list of markers positions - for i in range(0, size): + # Iterate through the list of markers positions + for i in range(0, len(markers_positions)): + # If the current marker is not the first one if i > 0: + # Calculate the interval between the current marker and the previous one #TODO #Kosambi inverse function - IntervalWithPreviousMarker = positions[i] - positions[i-1] - temp = 2 * (IntervalWithPreviousMarker / 100) - IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(temp) - math.exp(-temp)) / (math.exp(temp) + math.exp(-temp))) + IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1] + tempcM = 2 * (IntervalWithPreviousMarker / 100) / len(markers_positions) # markers_nb (= len(markers_positions)) corresponds to bpPercM + IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM))) rnd = random.random() + # If the random number is greater than the probability of a recombination + #print("before",genotype1, genotype2) if rnd > 1 - IntervalWithPreviousMarkerInRF: # a recombination occurs + # If the previous marker is A, change the genotype to B if genotype1 == "A": genotype1 = "B"; else : genotype1 = "A"; + rnd = random.random() + if rnd > 1 - IntervalWithPreviousMarkerInRF: + # If the previous marker is B, change the genotype to A if genotype2 == "A": genotype2 = "B"; else : - genotype1 = "A"; - + genotype2 = "A"; + #print("after",genotype1, genotype2) + # If the current marker is the first one, fix the two first genotypes. else: - # first marker rnd = random.random() + # If the random number is greater than 0.5 if rnd > 0.5: genotype1 = "A" + else : + genotype1 = "B" + + rnd = random.random() + if rnd > 0.5: genotype2 = "B" else : genotype2 = "A" - genotype1 = "B" - #TODO : ajouter erreur de séquençage - # sd : standard deviation - g = random.gauss(mean_depth, sd_depth) + #TODO : ajouter erreur de séquençage + + # Calculate the depth of the current marker + g = np.random.poisson(mean_depth) + # If the depth of the current marker is less than 0 (safeguard) if g < 0: g = 0 + # Round the depth of the current marker site_depth = round(g) + # Initialize x and y x = 0 y = 0 + # If the depth of the current marker is 0, genotype is Missing Data if site_depth == 0: genotype = "./." else : + # If the current marker is A and the previous marker is A, genotype if homozygote A (ref, 0/0) if genotype1 == "A" and genotype2 == "A": x = site_depth y = 0 genotype = "0/0" + # If the current marker is B and the previous marker is B, genotype if homozygote B (alt, 1/1) elif genotype1 == "B" and genotype2 == "B": x = 1 y = site_depth genotype = "1/1" + # If the current marker is neither A nor B, genotype if heterozygous H (0/1) else: - #x = random.gauss(site_depth, sd2) + # Generate a random number between 0 and the depth of the current marker x = random.randint(0,site_depth) y = site_depth - x + # If the depth of x of the current marker is 0, seen as homozygous if x == 0: genotype = "1/1" + # If the depth of y of the current marker is not 0, seen as homozygous elif y == 0: genotype = "0/0" + # If the depth x and y of the current marker is not 0, seen as heterozygous else : genotype = "0/1" - - - finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:." + #print("reads",genotype1, genotype2) + #print(genotype) + # Append the genotype and the depth of the current marker to the segment list + finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:.:." segment.append(finalGenotype) - # Return the random segment return segment + # Generate a list of individuals -def generate_individuals (nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, sd_depth): +def generate_individuals (nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, max_depth): matrix = [] + + # Calculate the number of marker requiered + markers_nb = round(chromosome_size * marker_density) + # Generate a sorted list of markers positions + # positions = sorted(random.sample(range(chromosome_size),size)) + markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int") + for i in range(nb_individuals): print("individual " + str(i+1)) - matrix.append(generate_chr_for_one_ind(chromosome_size, marker_density, err_rate, mean_depth, sd_depth)) + matrix.append(generate_chr_for_one_ind(chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions)) - return matrix + return matrix, markers_positions -# Check if a row is correct -def is_correct (row): - l = len(row) +# # Check if a row is correct +# def is_correct (row): +# l = len(row) - count_h = row.count("H") / l - count_a = row.count("A") / l - count_b = row.count("B") / l +# count_h = row.count("H") / l +# count_a = row.count("A") / l +# count_b = row.count("B") / l - return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3] +# return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3] -chromosome_size = 100000 -nb_individuals = 100 +# Set the parameters for the script +nb_individuals = 3 err_rate = 0.02 chromosome_size = 44000000 marker_density = 255000/44000000 mean_depth = 3 -sd_depth = 1 -pop = generate_individuals(nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, sd_depth) - -# # Create an empty list to store the individuals -# rows = [] - -# # Loop through the population -# for i in range(chromosome_size): -# # Create an empty list to store the individual -# row = [] - -# # Loop through the individuals in the population -# for ind in pop: -# # Append the individual's chromosome to the row -# row.append(ind[i]) - -# # Append the row to the list of rows -# #row.append(is_correct(row)) - -# rows.append(row) - +max_depth = 10 #TODO +pop = generate_individuals(nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, max_depth) +matrix = pop[0] +markers_positions = pop[1] +# Header to add to the VCF for it to be recognized as such file. header = [ "##fileformat=VCFv4.1\n", "##fileDate=20090805\n", @@ -157,9 +174,8 @@ header = [ "##FORMAT=<ID=HQ,Number=2,Type=Integer,Description=\"segment Quality\">\n" ] -#with open('test.vcf', 'w') as file: -with open('test.tsv', 'w') as file: +with open('test.vcf', 'w') as file: # Write the header of the file file.writelines(header) @@ -167,26 +183,10 @@ with open('test.tsv', 'w') as file: file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + [str(n) for n in range(nb_individuals)] + ["Parent1", "Parent2"] + ["Test"]) + "\n") # Iterate over each row in the table - #for i, row in enumerate(rows): - for m in range(0, len(pop[0])): - row = ["chr01", str(m + 1), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"] - for i in range(0, len(pop)): - row.append(pop[i][m]) + for m in range(0, len(matrix[0])): + row = ["chr01", str(markers_positions[m]), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"] + for i in range(0, len(matrix)): + row.append(matrix[i][m]) row.append("0/0:"+ str(mean_depth) + ":" + str(mean_depth) + ",0:.:.:.:.:.") row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.:.") - file.write("\t".join(row) + "\n"); - - # # Create an empty list to store the converted row - # converted_row = [] - - # # Iterate over each genotype in the row - # for genotype in ind: - # converted_row.append(genotype + ":.:.:.:.:.") - - # file.write("\t".join(["chr01", str(i + 1), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"] + converted_row + ["0/0:"+ mean_depth + ":" + mean_depth + ",0:.:.:.:.:.", "1/1:"+ mean_depth + ":0," + mean_depth + ":.:.:.:.:."]) + "\n" ) - - - - ##import random; random.sample(range(101), 10) ##10 random values in a range of 100 without duplicate. - - \ No newline at end of file + file.write("\t".join(row) + "\n"); \ No newline at end of file