diff --git a/noisyplotype.py b/noisyplotype.py index 1388a0f6f3f5758b04ec09a87fc2b893793c2caa..52ac074e1f17e8325e2bb9654eea2e2a56a01682 100644 --- a/noisyplotype.py +++ b/noisyplotype.py @@ -1,27 +1,24 @@ import random -import csv import math import numpy as np #def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions, conversion_factor): -def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor): +def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, errA, errB): ''' Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth). Args: - chromosome_size (int): The size of the chromosome. - marker_density (float): The density of markers. - err_rate (float): The error rate of the chromosome. mean_depth (float): The mean depth of the chromosome. - max_depth - markers_positions + markers_positions (lst) : liste of marker positions (picked on a grid) + err_rate (float): The error rate of the chromosome. Returns: - list: The chromosome. + list: The chromosome with each position having a given genotype. ''' segment = [] + segment_error = [] # Iterate through the list of markers positions for i in range(0, len(markers_positions)): @@ -78,56 +75,99 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor): # Initialize x and y x = 0 y = 0 + x_error = 0 + y_error = 0 # If the depth of the current marker is 0, genotype is Missing Data if site_depth == 0: genotype = "./." + genotype_error = "./." else : # If the current marker is A and the previous marker is A, genotype if homozygote A (ref, 0/0) if genotype1 == "A" and genotype2 == "A": x = site_depth y = 0 genotype = "0/0" + x_error = 0 + y_error = 0 + for j in range(0,site_depth): + rnd = random.random() + # If the random number is greater than errB, we switch increase the wronge allele depth + if rnd > errA: + x_error = x_error + 0 + y_error = y_error + 1 + else : + x_error = x_error + 1 + y_error = y_error + 0 + if x_error > 0: + genotype_error = "0/1" + else : + genotype_error = "0/0" + # If the current marker is B and the previous marker is B, genotype if homozygote B (alt, 1/1) elif genotype1 == "B" and genotype2 == "B": x = 1 y = site_depth genotype = "1/1" + x_error = 0 + y_error = 0 + for j in range(0,site_depth): + rnd = random.random() + # If the random number is greater than errB, we switch increase the wronge allele depth + if rnd > errB: + x_error = x_error + 1 + y_error = y_error + 0 + else : + x_error = x_error + 0 + y_error = y_error + 1 + if x_error > 0: + genotype_error = "0/1" + else : + genotype_error = "1/1" # If the current marker is neither A nor B, genotype if heterozygous H (0/1) else: # Generate a random number between 0 and the depth of the current marker x = random.randint(0,site_depth) y = site_depth - x + x_error = x + y_error = y # If the depth of x of the current marker is 0, seen as homozygous if x == 0: genotype = "1/1" + genotype_error = "1/1" # If the depth of y of the current marker is not 0, seen as homozygous elif y == 0: - genotype = "0/0" + genotype = "0/0" + genotype_error = "0/0" # If the depth x and y of the current marker is not 0, seen as heterozygous else : - genotype = "0/1" + genotype = "0/1" + genotype_error = "0/1" #print("reads",genotype1, genotype2) #print(genotype) # Append the genotype and the depth of the current marker to the segment list finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:." + print(finalGenotype) segment.append(finalGenotype) + + finalGenotype_error = str(genotype_error) + ":" + str(site_depth) + ":" + str(x_error) + "," + str(y_error) + ":.:.:.:.:." + print(finalGenotype_error) + segment_error.append(finalGenotype_error) return segment - # Generate a list of individuals -def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor): +def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB): matrix = [] # Calculate the number of marker requiered markers_nb = round(chromosome_size * marker_density) # Generate a sorted list of markers positions - # positions = sorted(random.sample(range(chromosome_size),size)) + #markers_positions = sorted(random.sample(range(chromosome_size),size)) markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int") for i in range(nb_individuals): print("individual " + str(i+1)) - matrix.append(generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor)) + matrix.append(generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor, errA, errB)) return matrix, markers_positions @@ -142,17 +182,19 @@ def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_ # return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3] # Set the parameters for the script -nb_individuals = 3 -chromosome_size = 44000000 -cMsize = 180 ## Size of genetic map +nb_individuals = 1 +chromosome_size = 1000 +cMsize = 5 ## Size of genetic map conversion_factor = chromosome_size/cMsize ## Corresponds to a bpPercM conversion ! Needs to be fixed... Does not produce the correct division! -#conversion_factor = 233000 -print(conversion_factor) -marker_density = 255000/44000000 +#print(conversion_factor) +marker_density = 0.006 mean_depth = 3 -max_depth = 10 #TODO -err_rate = 0.02 -pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor) +max_depth = 6 #TODO +errA = 0.005 +errB = 0.005 + +# #size = 200000 +pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB) matrix = pop[0] markers_positions = pop[1] @@ -180,7 +222,7 @@ header = [ ] -with open('Newtest.vcf', 'w') as file: +with open('ABtest.vcf', 'w') as file: # Write the header of the file file.writelines(header)