diff --git a/noisyplotype.py b/noisyplotype.py index 1388a0f6f3f5758b04ec09a87fc2b893793c2caa..055715f8b3a992f3519bdc6a4b51e64a6aa36348 100644 --- a/noisyplotype.py +++ b/noisyplotype.py @@ -1,56 +1,65 @@ import random -import csv import math import numpy as np #def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions, conversion_factor): -def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor): +def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, errA, errB): ''' Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth). Args: - chromosome_size (int): The size of the chromosome. - marker_density (float): The density of markers. - err_rate (float): The error rate of the chromosome. - mean_depth (float): The mean depth of the chromosome. - max_depth - markers_positions + mean_depth (float): The mean sequencing depth of the chromosome + markers_positions (lst) : liste of marker positions (picked on a grid) + conversion_factor (float) : Value of the bp per cM conversion (bp chromosome size / cM chromosome size) + errA & errB (float): Respectively the error rate of observing a B whereas the genotype is truely a A and a A whereas the genotype is truely a B Returns: - list: The chromosome. + segment: A chromosome with each position having a given genotype with a given site depth and allele depth + segment_error : A chromosome with each position having a given genotype with a given site depth and allele depth considering the error rates errA and errB ''' - + + # Create an empty list to store the chromosome segment = [] + # Create an empty list to store the chromosome with error + segment_error = [] + breakpoints = [] - # Iterate through the list of markers positions + # Iterate through the list of markers positions (fixed) + previousMarkerGenotype = "" for i in range(0, len(markers_positions)): + recomb1 = False + recomb2 = False # If the current marker is not the first one if i > 0: # Calculate the interval between the current marker and the previous one - #TODO - #Kosambi inverse function + #using the Kosambi inverse function to estimate the chance of recombination IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1] - tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor + tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor ## Conversion factor is used to have the bp per cM IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM))) rnd = random.random() - # If the random number is greater than the probability of a recombination - #print("before",genotype1, genotype2) - if rnd > 1 - IntervalWithPreviousMarkerInRF: # a recombination occurs + + # If the random number is greater than the probability of a recombination a recombination occurs + # Done for genotype 1 + if rnd > 1 - IntervalWithPreviousMarkerInRF: + recomb1 = True # If the previous marker is A, change the genotype to B if genotype1 == "A": - genotype1 = "B"; + genotype1 = "B" else : - genotype1 = "A"; + genotype1 = "A" + + # Done for genotype 2 rnd = random.random() if rnd > 1 - IntervalWithPreviousMarkerInRF: + recomb2 = True # If the previous marker is B, change the genotype to A if genotype2 == "A": - genotype2 = "B"; + genotype2 = "B" else : - genotype2 = "A"; - #print("after",genotype1, genotype2) - # If the current marker is the first one, fix the two first genotypes. + genotype2 = "A" + + # If the current marker is the first one, Set the two first genotypes (random). else: rnd = random.random() # If the random number is greater than 0.5 @@ -64,72 +73,157 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor): genotype2 = "B" else : genotype2 = "A" + + # initialize previousMarkerGenotype + if genotype1 == "A" and genotype2 == "A": + previousMarkerGenotype = "A" + elif genotype1 == "B" and genotype2 == "B": + previousMarkerGenotype = "B" + else : + previousMarkerGenotype = "H" + #Breakpoints + if recomb1 or recomb2: + #this is a breakpoint [beforeBkpAfter, transition] + transition = previousMarkerGenotype + " => " - #TODO : ajouter erreur de séquençage + if genotype1 == "A" and genotype2 == "A": + previousMarkerGenotype = "A" + elif genotype1 == "B" and genotype2 == "B": + previousMarkerGenotype = "B" + else : + previousMarkerGenotype = "H" + + transition = transition + previousMarkerGenotype - # Calculate the depth of the current marker + breakpoints.append([i,transition]) + + # Calculate the site depth of the current marker g = np.random.poisson(mean_depth) # If the depth of the current marker is less than 0 (safeguard) if g < 0: g = 0 # Round the depth of the current marker site_depth = round(g) - # Initialize x and y + # Initialize x and y (with and without error) x = 0 y = 0 + x_error = 0 + y_error = 0 + # If the depth of the current marker is 0, genotype is Missing Data if site_depth == 0: genotype = "./." + genotype_error = "./." else : - # If the current marker is A and the previous marker is A, genotype if homozygote A (ref, 0/0) + # If the site is homozygous A (ref, 0/0) if genotype1 == "A" and genotype2 == "A": x = site_depth y = 0 genotype = "0/0" + x_error = 0 + y_error = 0 + # Considering the sequencing and mapping error at each site for a given site depth + for j in range(0,site_depth): + rnd = random.random() + # If the random number is smaller the error rate, we increase the wronge allele depth + if rnd < errA: + x_error = x_error + 0 + y_error = y_error + 1 + else : + x_error = x_error + 1 + y_error = y_error + 0 + if y_error == site_depth: + genotype_error = "1/1" + elif y_error > 0: + genotype_error = "0/1" + else : + genotype_error = "0/0" + # If the current marker is B and the previous marker is B, genotype if homozygote B (alt, 1/1) elif genotype1 == "B" and genotype2 == "B": - x = 1 + x = 0 y = site_depth genotype = "1/1" + x_error = 0 + y_error = 0 + # Considering the sequencing and mapping error at each site for a given site depth + for j in range(0,site_depth): + rnd = random.random() + # If the random number is smaller the error rate, we increase the wronge allele depth + if rnd < errB: + x_error = x_error + 1 + y_error = y_error + 0 + else : + x_error = x_error + 0 + y_error = y_error + 1 + if x_error == site_depth: + genotype_error = "0/0" + elif x_error > 0: + genotype_error = "0/1" + else : + genotype_error = "1/1" + # If the current marker is neither A nor B, genotype if heterozygous H (0/1) else: # Generate a random number between 0 and the depth of the current marker x = random.randint(0,site_depth) y = site_depth - x - # If the depth of x of the current marker is 0, seen as homozygous + # Error of A and B compensate themselves in heterozygous site so no need to change the x_error and y_error + x_error = x + y_error = y + # If the depth of x of the current marker is 0, it's seen as homozygous site (alt, 1/1) if x == 0: genotype = "1/1" - # If the depth of y of the current marker is not 0, seen as homozygous + genotype_error = "1/1" + # If the depth of x of the current marker is 0, it's seen as homozygous site (ref, 0/0) elif y == 0: - genotype = "0/0" - # If the depth x and y of the current marker is not 0, seen as heterozygous + genotype = "0/0" + genotype_error = "0/0" + # If the depth x and y of the current marker is not 0, it's seen as heterozygous (0/1) else : - genotype = "0/1" + genotype = "0/1" + genotype_error = "0/1" + #print("reads",genotype1, genotype2) #print(genotype) # Append the genotype and the depth of the current marker to the segment list - finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:." + finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:." segment.append(finalGenotype) + + # Append the genotype and the depth of the current marker to the segment_error list + finalGenotype_error = str(genotype_error) + ":" + str(site_depth) + ":" + str(x_error) + "," + str(y_error) + ":.:.:.:.:." + segment_error.append(finalGenotype_error) - return segment - + return segment, segment_error, breakpoints # Generate a list of individuals -def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor): +def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB): matrix = [] + matrix_error = [] # Calculate the number of marker requiered markers_nb = round(chromosome_size * marker_density) + # Generate a sorted list of markers positions - # positions = sorted(random.sample(range(chromosome_size),size)) + #markers_positions = sorted(random.sample(range(chromosome_size),size)) markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int") - for i in range(nb_individuals): - print("individual " + str(i+1)) - matrix.append(generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor)) + with open('Breakpoints.csv', 'w') as breakpointFile: + breakpointFile.write(",".join(["sample","average_bkp_position", "bkp_start_position", "bkp_stop_position", "transitionType"]) + "\n"); + for i in range(nb_individuals): + print("individual " + str(i+1)) + result = generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor, errA, errB) + matrix.append(result[0]) + matrix_error.append(result[1]) + for bkp in result[2]: + startBkp = markers_positions[bkp[0] - 1] + stopBkp = markers_positions[bkp[0]] + averageBkpPosition = startBkp + round((stopBkp - startBkp)/2) + bkp_row = [str(i), str(averageBkpPosition), str(startBkp), str(stopBkp), bkp[1]] + breakpointFile.write(",".join(bkp_row) + "\n"); - return matrix, markers_positions + return matrix, matrix_error, markers_positions # # Check if a row is correct # def is_correct (row): @@ -142,20 +236,21 @@ def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_ # return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3] # Set the parameters for the script -nb_individuals = 3 +nb_individuals = 100 chromosome_size = 44000000 cMsize = 180 ## Size of genetic map conversion_factor = chromosome_size/cMsize ## Corresponds to a bpPercM conversion ! Needs to be fixed... Does not produce the correct division! -#conversion_factor = 233000 -print(conversion_factor) -marker_density = 255000/44000000 +marker_density = 0.0055 mean_depth = 3 -max_depth = 10 #TODO -err_rate = 0.02 -pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor) -matrix = pop[0] -markers_positions = pop[1] +max_depth = 6 #TODO (better estimate of max?) +errA = 0.005 +errB = 0.005 +## Generate the pop and associate the results to the matrixes with and without errors +pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB) +matrix = pop[0] +matrix_error = pop[1] +markers_positions = pop[2] # Header to add to the VCF for it to be recognized as such file. header = [ @@ -180,7 +275,8 @@ header = [ ] -with open('Newtest.vcf', 'w') as file: +## Write the VCF files (with and without errors) +with open('test.vcf', 'w') as file: # Write the header of the file file.writelines(header) @@ -194,4 +290,20 @@ with open('Newtest.vcf', 'w') as file: row.append(matrix[i][m]) row.append("0/0:"+ str(mean_depth) + ":" + str(mean_depth) + ",0:.:.:.:.") row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.") + file.write("\t".join(row) + "\n"); + +with open('test_error.vcf', 'w') as file: + # Write the header of the file + file.writelines(header) + + # Write the header of the file + file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + [str(n) for n in range(nb_individuals)] + ["Parent1", "Parent2"]) + "\n") + + # Iterate over each row in the table + for m in range(0, len(matrix_error[0])): + row = ["chr01", str(markers_positions[m]), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"] + for i in range(0, len(matrix_error)): + row.append(matrix_error[i][m]) + row.append("0/0:"+ str(mean_depth) + ":" + str(mean_depth) + ",0:.:.:.:.") + row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.") file.write("\t".join(row) + "\n"); \ No newline at end of file