Skip to content
Snippets Groups Projects
Commit bf407962 authored by alice.boizet_cirad.fr's avatar alice.boizet_cirad.fr
Browse files

Merge branch 'errorrates' into 'master'

Adding Error rates to noisyplotype.py

See merge request noisymputer/popsimul!2
parents 6d8b8423 c0f82cd6
No related branches found
No related tags found
No related merge requests found
import random
import csv
import math
import numpy as np
#def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions, conversion_factor):
def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor):
def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, errA, errB):
'''
Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth).
Args:
chromosome_size (int): The size of the chromosome.
marker_density (float): The density of markers.
err_rate (float): The error rate of the chromosome.
mean_depth (float): The mean depth of the chromosome.
max_depth
markers_positions
mean_depth (float): The mean sequencing depth of the chromosome
markers_positions (lst) : liste of marker positions (picked on a grid)
conversion_factor (float) : Value of the bp per cM conversion (bp chromosome size / cM chromosome size)
errA & errB (float): Respectively the error rate of observing a B whereas the genotype is truely a A and a A whereas the genotype is truely a B
Returns:
list: The chromosome.
segment: A chromosome with each position having a given genotype with a given site depth and allele depth
segment_error : A chromosome with each position having a given genotype with a given site depth and allele depth considering the error rates errA and errB
'''
# Create an empty list to store the chromosome
segment = []
# Create an empty list to store the chromosome with error
segment_error = []
breakpoints = []
# Iterate through the list of markers positions
# Iterate through the list of markers positions (fixed)
previousMarkerGenotype = ""
for i in range(0, len(markers_positions)):
recomb1 = False
recomb2 = False
# If the current marker is not the first one
if i > 0:
# Calculate the interval between the current marker and the previous one
#TODO
#Kosambi inverse function
#using the Kosambi inverse function to estimate the chance of recombination
IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1]
tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor
tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor ## Conversion factor is used to have the bp per cM
IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM)))
rnd = random.random()
# If the random number is greater than the probability of a recombination
#print("before",genotype1, genotype2)
if rnd > 1 - IntervalWithPreviousMarkerInRF: # a recombination occurs
# If the random number is greater than the probability of a recombination a recombination occurs
# Done for genotype 1
if rnd > 1 - IntervalWithPreviousMarkerInRF:
recomb1 = True
# If the previous marker is A, change the genotype to B
if genotype1 == "A":
genotype1 = "B";
genotype1 = "B"
else :
genotype1 = "A";
genotype1 = "A"
# Done for genotype 2
rnd = random.random()
if rnd > 1 - IntervalWithPreviousMarkerInRF:
recomb2 = True
# If the previous marker is B, change the genotype to A
if genotype2 == "A":
genotype2 = "B";
genotype2 = "B"
else :
genotype2 = "A";
#print("after",genotype1, genotype2)
# If the current marker is the first one, fix the two first genotypes.
genotype2 = "A"
# If the current marker is the first one, Set the two first genotypes (random).
else:
rnd = random.random()
# If the random number is greater than 0.5
......@@ -64,72 +73,157 @@ def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor):
genotype2 = "B"
else :
genotype2 = "A"
# initialize previousMarkerGenotype
if genotype1 == "A" and genotype2 == "A":
previousMarkerGenotype = "A"
elif genotype1 == "B" and genotype2 == "B":
previousMarkerGenotype = "B"
else :
previousMarkerGenotype = "H"
#Breakpoints
if recomb1 or recomb2:
#this is a breakpoint [beforeBkpAfter, transition]
transition = previousMarkerGenotype + " => "
#TODO : ajouter erreur de séquençage
if genotype1 == "A" and genotype2 == "A":
previousMarkerGenotype = "A"
elif genotype1 == "B" and genotype2 == "B":
previousMarkerGenotype = "B"
else :
previousMarkerGenotype = "H"
transition = transition + previousMarkerGenotype
# Calculate the depth of the current marker
breakpoints.append([i,transition])
# Calculate the site depth of the current marker
g = np.random.poisson(mean_depth)
# If the depth of the current marker is less than 0 (safeguard)
if g < 0:
g = 0
# Round the depth of the current marker
site_depth = round(g)
# Initialize x and y
# Initialize x and y (with and without error)
x = 0
y = 0
x_error = 0
y_error = 0
# If the depth of the current marker is 0, genotype is Missing Data
if site_depth == 0:
genotype = "./."
genotype_error = "./."
else :
# If the current marker is A and the previous marker is A, genotype if homozygote A (ref, 0/0)
# If the site is homozygous A (ref, 0/0)
if genotype1 == "A" and genotype2 == "A":
x = site_depth
y = 0
genotype = "0/0"
x_error = 0
y_error = 0
# Considering the sequencing and mapping error at each site for a given site depth
for j in range(0,site_depth):
rnd = random.random()
# If the random number is smaller the error rate, we increase the wronge allele depth
if rnd < errA:
x_error = x_error + 0
y_error = y_error + 1
else :
x_error = x_error + 1
y_error = y_error + 0
if y_error == site_depth:
genotype_error = "1/1"
elif y_error > 0:
genotype_error = "0/1"
else :
genotype_error = "0/0"
# If the current marker is B and the previous marker is B, genotype if homozygote B (alt, 1/1)
elif genotype1 == "B" and genotype2 == "B":
x = 1
x = 0
y = site_depth
genotype = "1/1"
x_error = 0
y_error = 0
# Considering the sequencing and mapping error at each site for a given site depth
for j in range(0,site_depth):
rnd = random.random()
# If the random number is smaller the error rate, we increase the wronge allele depth
if rnd < errB:
x_error = x_error + 1
y_error = y_error + 0
else :
x_error = x_error + 0
y_error = y_error + 1
if x_error == site_depth:
genotype_error = "0/0"
elif x_error > 0:
genotype_error = "0/1"
else :
genotype_error = "1/1"
# If the current marker is neither A nor B, genotype if heterozygous H (0/1)
else:
# Generate a random number between 0 and the depth of the current marker
x = random.randint(0,site_depth)
y = site_depth - x
# If the depth of x of the current marker is 0, seen as homozygous
# Error of A and B compensate themselves in heterozygous site so no need to change the x_error and y_error
x_error = x
y_error = y
# If the depth of x of the current marker is 0, it's seen as homozygous site (alt, 1/1)
if x == 0:
genotype = "1/1"
# If the depth of y of the current marker is not 0, seen as homozygous
genotype_error = "1/1"
# If the depth of x of the current marker is 0, it's seen as homozygous site (ref, 0/0)
elif y == 0:
genotype = "0/0"
# If the depth x and y of the current marker is not 0, seen as heterozygous
genotype = "0/0"
genotype_error = "0/0"
# If the depth x and y of the current marker is not 0, it's seen as heterozygous (0/1)
else :
genotype = "0/1"
genotype = "0/1"
genotype_error = "0/1"
#print("reads",genotype1, genotype2)
#print(genotype)
# Append the genotype and the depth of the current marker to the segment list
finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:."
finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:."
segment.append(finalGenotype)
# Append the genotype and the depth of the current marker to the segment_error list
finalGenotype_error = str(genotype_error) + ":" + str(site_depth) + ":" + str(x_error) + "," + str(y_error) + ":.:.:.:.:."
segment_error.append(finalGenotype_error)
return segment
return segment, segment_error, breakpoints
# Generate a list of individuals
def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor):
def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB):
matrix = []
matrix_error = []
# Calculate the number of marker requiered
markers_nb = round(chromosome_size * marker_density)
# Generate a sorted list of markers positions
# positions = sorted(random.sample(range(chromosome_size),size))
#markers_positions = sorted(random.sample(range(chromosome_size),size))
markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int")
for i in range(nb_individuals):
print("individual " + str(i+1))
matrix.append(generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor))
with open('Breakpoints.csv', 'w') as breakpointFile:
breakpointFile.write(",".join(["sample","average_bkp_position", "bkp_start_position", "bkp_stop_position", "transitionType"]) + "\n");
for i in range(nb_individuals):
print("individual " + str(i+1))
result = generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor, errA, errB)
matrix.append(result[0])
matrix_error.append(result[1])
for bkp in result[2]:
startBkp = markers_positions[bkp[0] - 1]
stopBkp = markers_positions[bkp[0]]
averageBkpPosition = startBkp + round((stopBkp - startBkp)/2)
bkp_row = [str(i), str(averageBkpPosition), str(startBkp), str(stopBkp), bkp[1]]
breakpointFile.write(",".join(bkp_row) + "\n");
return matrix, markers_positions
return matrix, matrix_error, markers_positions
# # Check if a row is correct
# def is_correct (row):
......@@ -142,20 +236,21 @@ def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_
# return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3]
# Set the parameters for the script
nb_individuals = 3
nb_individuals = 100
chromosome_size = 44000000
cMsize = 180 ## Size of genetic map
conversion_factor = chromosome_size/cMsize ## Corresponds to a bpPercM conversion ! Needs to be fixed... Does not produce the correct division!
#conversion_factor = 233000
print(conversion_factor)
marker_density = 255000/44000000
marker_density = 0.0055
mean_depth = 3
max_depth = 10 #TODO
err_rate = 0.02
pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor)
matrix = pop[0]
markers_positions = pop[1]
max_depth = 6 #TODO (better estimate of max?)
errA = 0.005
errB = 0.005
## Generate the pop and associate the results to the matrixes with and without errors
pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB)
matrix = pop[0]
matrix_error = pop[1]
markers_positions = pop[2]
# Header to add to the VCF for it to be recognized as such file.
header = [
......@@ -180,7 +275,8 @@ header = [
]
with open('Newtest.vcf', 'w') as file:
## Write the VCF files (with and without errors)
with open('test.vcf', 'w') as file:
# Write the header of the file
file.writelines(header)
......@@ -194,4 +290,20 @@ with open('Newtest.vcf', 'w') as file:
row.append(matrix[i][m])
row.append("0/0:"+ str(mean_depth) + ":" + str(mean_depth) + ",0:.:.:.:.")
row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.")
file.write("\t".join(row) + "\n");
with open('test_error.vcf', 'w') as file:
# Write the header of the file
file.writelines(header)
# Write the header of the file
file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + [str(n) for n in range(nb_individuals)] + ["Parent1", "Parent2"]) + "\n")
# Iterate over each row in the table
for m in range(0, len(matrix_error[0])):
row = ["chr01", str(markers_positions[m]), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"]
for i in range(0, len(matrix_error)):
row.append(matrix_error[i][m])
row.append("0/0:"+ str(mean_depth) + ":" + str(mean_depth) + ",0:.:.:.:.")
row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.")
file.write("\t".join(row) + "\n");
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment