Skip to content
Snippets Groups Projects
Commit 845547da authored by Cecile Triay's avatar Cecile Triay
Browse files

Update comments and delete unused section of script.

parent c10927e4
No related branches found
No related tags found
No related merge requests found
......@@ -9,36 +9,41 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth).
Args:
mean_depth (float): The mean depth of the chromosome.
mean_depth (float): The mean sequencing depth of the chromosome
markers_positions (lst) : liste of marker positions (picked on a grid)
err_rate (float): The error rate of the chromosome.
conversion_factor (float) : Value of the bp per cM conversion (bp chromosome size / cM chromosome size)
errA & errB (float): Respectively the error rate of observing a B whereas the genotype is truely a A and a A whereas the genotype is truely a B
Returns:
list: The chromosome with each position having a given genotype.
segment: A chromosome with each position having a given genotype with a given site depth and allele depth
segment_error : A chromosome with each position having a given genotype with a given site depth and allele depth considering the error rates errA and errB
'''
# Create an empty list to store the chromosome
segment = []
# Create an empty list to store the chromosome with error
segment_error = []
# Iterate through the list of markers positions
# Iterate through the list of markers positions (fixed)
for i in range(0, len(markers_positions)):
# If the current marker is not the first one
if i > 0:
# Calculate the interval between the current marker and the previous one
#TODO
#Kosambi inverse function
#using the Kosambi inverse function to estimate the chance of recombination
IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1]
tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor
tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor ## Conversion factor is used to have the bp per cM
IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM)))
rnd = random.random()
# If the random number is greater than the probability of a recombination
#print("before",genotype1, genotype2)
if rnd > 1 - IntervalWithPreviousMarkerInRF: # a recombination occurs
# If the random number is greater than the probability of a recombination a recombination occurs
# Done for genotype 1
if rnd > 1 - IntervalWithPreviousMarkerInRF:
# If the previous marker is A, change the genotype to B
if genotype1 == "A":
genotype1 = "B";
else :
genotype1 = "A";
# Done for genotype 2
rnd = random.random()
if rnd > 1 - IntervalWithPreviousMarkerInRF:
# If the previous marker is B, change the genotype to A
......@@ -46,8 +51,8 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
genotype2 = "B";
else :
genotype2 = "A";
#print("after",genotype1, genotype2)
# If the current marker is the first one, fix the two first genotypes.
# If the current marker is the first one, Set the two first genotypes (random).
else:
rnd = random.random()
# If the random number is greater than 0.5
......@@ -61,37 +66,36 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
genotype2 = "B"
else :
genotype2 = "A"
#TODO : ajouter erreur de séquençage
# Calculate the depth of the current marker
# Calculate the site depth of the current marker
g = np.random.poisson(mean_depth)
# If the depth of the current marker is less than 0 (safeguard)
if g < 0:
g = 0
# Round the depth of the current marker
site_depth = round(g)
# Initialize x and y
# Initialize x and y (with and without error)
x = 0
y = 0
x_error = 0
y_error = 0
# If the depth of the current marker is 0, genotype is Missing Data
if site_depth == 0:
genotype = "./."
genotype_error = "./."
else :
# If the current marker is A and the previous marker is A, genotype if homozygote A (ref, 0/0)
# If the site is homozygous A (ref, 0/0)
if genotype1 == "A" and genotype2 == "A":
x = site_depth
y = 0
genotype = "0/0"
x_error = 0
y_error = 0
# Considering the sequencing and mapping error at each site for a given site depth
for j in range(0,site_depth):
rnd = random.random()
# If the random number is greater than errB, we switch increase the wronge allele depth
# If the random number is smaller the error rate, we increase the wronge allele depth
if rnd < errA:
x_error = x_error + 0
y_error = y_error + 1
......@@ -112,9 +116,10 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
genotype = "1/1"
x_error = 0
y_error = 0
# Considering the sequencing and mapping error at each site for a given site depth
for j in range(0,site_depth):
rnd = random.random()
# If the random number is greater than errB, we switch increase the wronge allele depth
# If the random number is smaller the error rate, we increase the wronge allele depth
if rnd < errB:
x_error = x_error + 1
y_error = y_error + 0
......@@ -127,22 +132,24 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
genotype_error = "0/1"
else :
genotype_error = "1/1"
# If the current marker is neither A nor B, genotype if heterozygous H (0/1)
else:
# Generate a random number between 0 and the depth of the current marker
x = random.randint(0,site_depth)
y = site_depth - x
# Error of A and B compensate themselves in heterozygous site so no need to change the x_error and y_error
x_error = x
y_error = y
# If the depth of x of the current marker is 0, seen as homozygous
# If the depth of x of the current marker is 0, it's seen as homozygous site (alt, 1/1)
if x == 0:
genotype = "1/1"
genotype_error = "1/1"
# If the depth of y of the current marker is not 0, seen as homozygous
# If the depth of x of the current marker is 0, it's seen as homozygous site (ref, 0/0)
elif y == 0:
genotype = "0/0"
genotype_error = "0/0"
# If the depth x and y of the current marker is not 0, seen as heterozygous
# If the depth x and y of the current marker is not 0, it's seen as heterozygous (0/1)
else :
genotype = "0/1"
genotype_error = "0/1"
......@@ -152,6 +159,7 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor,
finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:."
segment.append(finalGenotype)
# Append the genotype and the depth of the current marker to the segment_error list
finalGenotype_error = str(genotype_error) + ":" + str(site_depth) + ":" + str(x_error) + "," + str(y_error) + ":.:.:.:.:."
segment_error.append(finalGenotype_error)
......@@ -164,6 +172,7 @@ def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_
# Calculate the number of marker requiered
markers_nb = round(chromosome_size * marker_density)
# Generate a sorted list of markers positions
#markers_positions = sorted(random.sample(range(chromosome_size),size))
markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int")
......@@ -188,17 +197,16 @@ def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_
# Set the parameters for the script
nb_individuals = 100
chromosome_size = 44000000
chromosome_size = 44000000 ## Size of chromosome in bp
cMsize = 180 ## Size of genetic map
conversion_factor = chromosome_size/cMsize ## Corresponds to a bpPercM conversion ! Needs to be fixed... Does not produce the correct division!
#print(conversion_factor)
marker_density = 0.0055
mean_depth = 3
max_depth = 6 #TODO
max_depth = 6 #TODO (better estimate of max?)
errA = 0.005
errB = 0.005
# #size = 200000
## Generate the pop and associate the results to the matrixes with and without errors
pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB)
matrix = pop[0]
matrix_error = pop[1]
......@@ -227,7 +235,8 @@ header = [
]
with open('ABtest.vcf', 'w') as file:
## Write the VCF files (with and without errors)
with open('test.vcf', 'w') as file:
# Write the header of the file
file.writelines(header)
......@@ -243,7 +252,7 @@ with open('ABtest.vcf', 'w') as file:
row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.")
file.write("\t".join(row) + "\n");
with open('ABtest_error.vcf', 'w') as file:
with open('test_error.vcf', 'w') as file:
# Write the header of the file
file.writelines(header)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment