diff --git a/noisyplotype.py b/noisyplotype.py index 14f6ee26ad2fc6f90f39f9482497a5b093d9fd39..dc7a334fe7d2bfeb066f25654e44fe4a93500496 100644 --- a/noisyplotype.py +++ b/noisyplotype.py @@ -9,19 +9,23 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth). Args: - mean_depth (float): The mean depth of the chromosome. + mean_depth (float): The mean sequencing depth of the chromosome markers_positions (lst) : liste of marker positions (picked on a grid) - err_rate (float): The error rate of the chromosome. + conversion_factor (float) : Value of the bp per cM conversion (bp chromosome size / cM chromosome size) + errA & errB (float): Respectively the error rate of observing a B whereas the genotype is truely a A and a A whereas the genotype is truely a B Returns: - list: The chromosome with each position having a given genotype. + segment: A chromosome with each position having a given genotype with a given site depth and allele depth + segment_error : A chromosome with each position having a given genotype with a given site depth and allele depth considering the error rates errA and errB ''' - + + # Create an empty list to store the chromosome segment = [] + # Create an empty list to store the chromosome with error segment_error = [] breakpoints = [] - # Iterate through the list of markers positions + # Iterate through the list of markers positions (fixed) previousMarkerGenotype = "" for i in range(0, len(markers_positions)): recomb1 = False @@ -29,21 +33,23 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, # If the current marker is not the first one if i > 0: # Calculate the interval between the current marker and the previous one - #TODO - #Kosambi inverse function + #using the Kosambi inverse function to estimate the chance of recombination IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1] - tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor + tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor ## Conversion factor is used to have the bp per cM IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM))) rnd = random.random() - # If the random number is greater than the probability of a recombination - #print("before",genotype1, genotype2) - if rnd > 1 - IntervalWithPreviousMarkerInRF: # a recombination occurs + + # If the random number is greater than the probability of a recombination a recombination occurs + # Done for genotype 1 + if rnd > 1 - IntervalWithPreviousMarkerInRF: recomb1 = True # If the previous marker is A, change the genotype to B if genotype1 == "A": genotype1 = "B"; else : genotype1 = "A"; + + # Done for genotype 2 rnd = random.random() if rnd > 1 - IntervalWithPreviousMarkerInRF: recomb2 = True @@ -52,8 +58,8 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, genotype2 = "B" else : genotype2 = "A" - #print("after",genotype1, genotype2) - # If the current marker is the first one, fix the two first genotypes. + + # If the current marker is the first one, Set the two first genotypes (random). else: rnd = random.random() # If the random number is greater than 0.5 @@ -92,33 +98,35 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, breakpoints.append([i,transition]) - # Calculate the depth of the current marker + # Calculate the site depth of the current marker g = np.random.poisson(mean_depth) # If the depth of the current marker is less than 0 (safeguard) if g < 0: g = 0 # Round the depth of the current marker site_depth = round(g) - # Initialize x and y + # Initialize x and y (with and without error) x = 0 y = 0 x_error = 0 y_error = 0 + # If the depth of the current marker is 0, genotype is Missing Data if site_depth == 0: genotype = "./." genotype_error = "./." else : - # If the current marker is A and the previous marker is A, genotype if homozygote A (ref, 0/0) + # If the site is homozygous A (ref, 0/0) if genotype1 == "A" and genotype2 == "A": x = site_depth y = 0 genotype = "0/0" x_error = 0 y_error = 0 + # Considering the sequencing and mapping error at each site for a given site depth for j in range(0,site_depth): rnd = random.random() - # If the random number is smaller than errA, we switch increase the wrong allele depth + # If the random number is smaller the error rate, we increase the wronge allele depth if rnd < errA: x_error = x_error + 0 y_error = y_error + 1 @@ -139,9 +147,10 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, genotype = "1/1" x_error = 0 y_error = 0 + # Considering the sequencing and mapping error at each site for a given site depth for j in range(0,site_depth): rnd = random.random() - # If the random number is smaller than errB, we switch increase the wronge allele depth + # If the random number is smaller the error rate, we increase the wronge allele depth if rnd < errB: x_error = x_error + 1 y_error = y_error + 0 @@ -154,22 +163,24 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, genotype_error = "0/1" else : genotype_error = "1/1" + # If the current marker is neither A nor B, genotype if heterozygous H (0/1) else: # Generate a random number between 0 and the depth of the current marker x = random.randint(0,site_depth) y = site_depth - x + # Error of A and B compensate themselves in heterozygous site so no need to change the x_error and y_error x_error = x y_error = y - # If the depth of x of the current marker is 0, seen as homozygous + # If the depth of x of the current marker is 0, it's seen as homozygous site (alt, 1/1) if x == 0: genotype = "1/1" genotype_error = "1/1" - # If the depth of y of the current marker is not 0, seen as homozygous + # If the depth of x of the current marker is 0, it's seen as homozygous site (ref, 0/0) elif y == 0: genotype = "0/0" genotype_error = "0/0" - # If the depth x and y of the current marker is not 0, seen as heterozygous + # If the depth x and y of the current marker is not 0, it's seen as heterozygous (0/1) else : genotype = "0/1" genotype_error = "0/1" @@ -180,6 +191,7 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:." segment.append(finalGenotype) + # Append the genotype and the depth of the current marker to the segment_error list finalGenotype_error = str(genotype_error) + ":" + str(site_depth) + ":" + str(x_error) + "," + str(y_error) + ":.:.:.:.:." segment_error.append(finalGenotype_error) @@ -193,6 +205,7 @@ def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_ # Calculate the number of marker requiered markers_nb = round(chromosome_size * marker_density) + # Generate a sorted list of markers positions #markers_positions = sorted(random.sample(range(chromosome_size),size)) markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int") @@ -228,14 +241,13 @@ nb_individuals = 3 chromosome_size = 44000000 cMsize = 180 ## Size of genetic map conversion_factor = chromosome_size/cMsize ## Corresponds to a bpPercM conversion ! Needs to be fixed... Does not produce the correct division! -#print(conversion_factor) marker_density = 0.0055 mean_depth = 3 -max_depth = 6 #TODO +max_depth = 6 #TODO (better estimate of max?) errA = 0.005 errB = 0.005 -# #size = 200000 +## Generate the pop and associate the results to the matrixes with and without errors pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB) matrix = pop[0] matrix_error = pop[1] @@ -264,7 +276,8 @@ header = [ ] -with open('ABtest.vcf', 'w') as file: +## Write the VCF files (with and without errors) +with open('test.vcf', 'w') as file: # Write the header of the file file.writelines(header) @@ -280,7 +293,7 @@ with open('ABtest.vcf', 'w') as file: row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.") file.write("\t".join(row) + "\n"); -with open('ABtest_error.vcf', 'w') as file: +with open('test_error.vcf', 'w') as file: # Write the header of the file file.writelines(header)