# noisyplotype.py - authored by Cecile Triay
import random
import math
import numpy as np
#def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions, conversion_factor):
def _flip_allele(allele):
    '''Return the other parental allele ("A" becomes "B", anything else becomes "A").'''
    return "B" if allele == "A" else "A"


def _diploid_state(genotype1, genotype2):
    '''Summarise the two haplotype alleles as "A", "B" or "H" (heterozygous).'''
    if genotype1 == "A" and genotype2 == "A":
        return "A"
    if genotype1 == "B" and genotype2 == "B":
        return "B"
    return "H"


def _call_genotype(ref_reads, alt_reads):
    '''Return the VCF genotype call implied by the allele depths of a covered site.'''
    if ref_reads == 0:
        return "1/1"
    if alt_reads == 0:
        return "0/0"
    return "0/1"


def _count_read_errors(site_depth, err_rate):
    '''Count how many of site_depth reads are miscalled, each with probability err_rate.'''
    return sum(1 for _ in range(site_depth) if random.random() < err_rate)


def generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor, errA, errB):
    '''
    Simulate one diploid individual along a chromosome at fixed marker positions.

    Two haplotypes are drawn at random at the first marker and then recombine
    independently between consecutive markers. At every marker a site depth is
    drawn from a Poisson distribution of mean mean_depth and split into
    reference (A) and alternate (B) allele depths.

    Args:
    mean_depth (float): Mean of the Poisson distribution used for the site depth
    markers_positions (lst) : list of marker positions in bp (picked on a grid)
    conversion_factor (float) : bp per cM conversion (bp chromosome size / cM chromosome size)
    errA & errB (float): Respectively the probability that a read is seen as B whereas
        the genotype is truly A, and as A whereas the genotype is truly B
    Returns:
    segment: one "GT:DP:AD:.:.:.:.:." string per marker, without read errors
    segment_error: same as segment, with errA / errB applied at homozygous sites
    breakpoints: list of [marker_index, "X => Y"] for every marker at which at least
        one haplotype recombined since the previous marker
    '''
    segment = []
    segment_error = []
    breakpoints = []
    genotype1 = ""
    genotype2 = ""
    previous_state = ""
    for i in range(len(markers_positions)):
        recombined = False
        if i == 0:
            # First marker: pick both haplotypes at random.
            genotype1 = "A" if random.random() > 0.5 else "B"
            genotype2 = "B" if random.random() > 0.5 else "A"
            previous_state = _diploid_state(genotype1, genotype2)
        else:
            # Kosambi mapping function: r = 0.5 * tanh(2d), d in Morgans.
            # math.tanh saturates for large distances, whereas the explicit
            # exp(x)/exp(-x) ratio raises OverflowError once x exceeds ~709.
            interval_bp = markers_positions[i] - markers_positions[i - 1]
            distance_morgans = (interval_bp / 100) / conversion_factor
            recomb_fraction = 0.5 * math.tanh(2 * distance_morgans)
            # One independent draw per haplotype, haplotype 1 first.
            if random.random() > 1 - recomb_fraction:
                recombined = True
                genotype1 = _flip_allele(genotype1)
            if random.random() > 1 - recomb_fraction:
                recombined = True
                genotype2 = _flip_allele(genotype2)

        if recombined:
            # Record the transition between marker i-1 and marker i. A double
            # recombination at a heterozygous site is recorded as "H => H".
            current_state = _diploid_state(genotype1, genotype2)
            breakpoints.append([i, previous_state + " => " + current_state])
            previous_state = current_state

        # A Poisson draw is never negative, so no clamping is required.
        site_depth = int(np.random.poisson(mean_depth))
        if site_depth == 0:
            # No read at this site: missing data in both outputs.
            x = y = x_error = y_error = 0
            genotype = genotype_error = "./."
        else:
            state = _diploid_state(genotype1, genotype2)
            if state == "A":
                # Homozygous reference: every true read is A, errors show up as B.
                x, y = site_depth, 0
                y_error = _count_read_errors(site_depth, errA)
                x_error = site_depth - y_error
            elif state == "B":
                # Homozygous alternate: every true read is B, errors show up as A.
                x, y = 0, site_depth
                x_error = _count_read_errors(site_depth, errB)
                y_error = site_depth - x_error
            else:
                # Heterozygous: the number of A reads is drawn uniformly in
                # [0, site_depth]; no read error is applied here (the original
                # author treats A and B errors as cancelling out).
                x = random.randint(0, site_depth)
                y = site_depth - x
                x_error, y_error = x, y
            # The call reflects the observed reads, so a heterozygous site with
            # reads from a single allele is reported as homozygous.
            genotype = _call_genotype(x, y)
            genotype_error = _call_genotype(x_error, y_error)

        segment.append("{}:{}:{},{}:.:.:.:.:.".format(genotype, site_depth, x, y))
        segment_error.append("{}:{}:{},{}:.:.:.:.:.".format(genotype_error, site_depth, x_error, y_error))
    return segment, segment_error, breakpoints
# Generate a list of individuals
def generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB, breakpoints_path='Breakpoints_3x.csv'):
    '''
    Simulate a population on a shared marker grid and log every breakpoint to a CSV file.

    Args:
    nb_individuals (int): Number of individuals to simulate
    chromosome_size (int): Chromosome length in bp
    marker_density (float): Markers per bp; the marker count is round(chromosome_size * marker_density)
    mean_depth (float): Mean site depth, passed to generate_chr_for_one_ind
    conversion_factor (float): bp per cM conversion, passed to generate_chr_for_one_ind
    errA & errB (float): Error rates, passed to generate_chr_for_one_ind
    breakpoints_path (str): Path of the breakpoint CSV to (over)write; defaults to the
        historical 'Breakpoints_3x.csv'
    Returns:
    matrix: one list of genotype strings per individual, without errors
    matrix_error: one list of genotype strings per individual, with errors
    markers_positions: numpy integer array of the marker positions
    '''
    matrix = []
    matrix_error = []
    # Markers are evenly spaced from 1 to chromosome_size (inclusive).
    markers_nb = round(chromosome_size * marker_density)
    markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int")
    csv_columns = ["sample", "average_bkp_position", "bkp_start_position", "bkp_stop_position", "transitionType"]
    with open(breakpoints_path, 'w') as breakpoint_file:
        breakpoint_file.write(",".join(csv_columns) + "\n")
        for i in range(nb_individuals):
            print("individual " + str(i+1))
            segment, segment_error, breakpoints = generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor, errA, errB)
            matrix.append(segment)
            matrix_error.append(segment_error)
            for marker_index, transition in breakpoints:
                # A breakpoint lies between the flagged marker and the one before it.
                start_bkp = markers_positions[marker_index - 1]
                stop_bkp = markers_positions[marker_index]
                average_bkp_position = start_bkp + round((stop_bkp - start_bkp)/2)
                # The sample column is the 0-based individual index.
                bkp_row = [str(i), str(average_bkp_position), str(start_bkp), str(stop_bkp), transition]
                breakpoint_file.write(",".join(bkp_row) + "\n")
    return matrix, matrix_error, markers_positions
# # Check if a row is correct
# def is_correct (row):
# l = len(row)
# count_h = row.count("H") / l
# count_a = row.count("A") / l
# count_b = row.count("B") / l
# return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3]
# Simulation parameters
nb_individuals = 100
chromosome_size = 44000000  # chromosome length in bp
cMsize = 180  # size of the genetic map in cM
# bp per cM. NOTE(review): the original comment flagged this division as
# "needs to be fixed"; under Python 3 `/` is a true (float) division.
conversion_factor = chromosome_size / cMsize
marker_density = 0.0055
mean_depth = 1.5
max_depth = 3  # TODO (better estimate of max?)
errA = 0.005
errB = 0.005

# Simulate the population; `pop` holds (matrix, matrix_error, markers_positions).
pop = generate_individuals(
    nb_individuals=nb_individuals,
    chromosome_size=chromosome_size,
    marker_density=marker_density,
    mean_depth=mean_depth,
    conversion_factor=conversion_factor,
    errA=errA,
    errB=errB,
)
matrix, matrix_error, markers_positions = pop[0], pop[1], pop[2]
# VCF meta-information lines written at the top of both output files.
# Each entry is stored without its line terminator; the newline is appended below.
header = [
    meta_line + "\n"
    for meta_line in (
        '##fileformat=VCFv4.1',
        '##fileDate=20090805',
        '##source=myImputationProgramV3.1',
        '##reference=/shared/projects/recombinationlandscape/REFERENCE_GENOME/Osat_Azucena_AGI_chrOK_uniline_WithNIPBARorganelles.fasta',
        '##contig=<ID=chr01,length=44011168>',
        '##phasing=partial',
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
        '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
        '##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
        '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">',
        '##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">',
        '##FILTER=<ID=q10,Description="Quality below 10">',
        '##FILTER=<ID=s50,Description="Less than 50% of samples have data">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="segment Quality">',
    )
]
## Write the two VCF files: error-free genotypes first, then genotypes with errors
# The column-header line and the two parental columns are the same for every
# record of both files, so they are built once up front.
column_line = "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + [str(n) for n in range(nb_individuals)] + ["Parent1", "Parent2"]) + "\n"
parent_depth = str(round(mean_depth))
parent1_column = "0/0:" + parent_depth + ":" + parent_depth + ",0:.:.:.:."
parent2_column = "1/1:" + parent_depth + ":0," + parent_depth + ":.:.:.:."

for vcf_path, genotypes in (('test_3x.vcf', matrix), ('test_error_3x.vcf', matrix_error)):
    with open(vcf_path, 'w') as file:
        # Meta-information lines, then the #CHROM column line
        file.writelines(header)
        file.write(column_line)
        # One record per marker: fixed fields, every individual, then both parents
        for m in range(len(genotypes[0])):
            row = ["chr01", str(markers_positions[m]), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"]
            row.extend(individual[m] for individual in genotypes)
            row.append(parent1_column)
            row.append(parent2_column)
            file.write("\t".join(row) + "\n")