Skip to content
Snippets Groups Projects
Commit ea5a11f7 authored by alice.boizet_cirad.fr's avatar alice.boizet_cirad.fr
Browse files

Merge branch 'comments' into 'master'

Comments

See merge request noisymputer/popsimul!1
parents 5ff25ba1 e9b3ba36
No related branches found
No related tags found
No related merge requests found
import random
import csv
import math
import numpy as np
def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions):
# '''Generate a random segment from a given genotype.
# Args:
# genotype (str): The genotype to generate a segment from.
# size (int): The length of segments to generate (number of sites).
# err_rate (float): The probability at a site that the segment's genotype will be replaced by a random other genotype.
# Returns:
# str: A random segment from the given genotype.
# '''
def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, sd_depth):
# Create a list of all possible genotypes
#genotypes = ["A", "B", "H", "H"]
'''
Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth).
Args:
chromosome_size (int): The size of the chromosome.
marker_density (float): The density of markers.
err_rate (float): The error rate of the chromosome.
mean_depth (float): The mean depth of the chromosome.
max_depth
markers_positions
# Create a list of all other genotypes that are not the given genotype
#other_genotypes = [g for g in genotypes if g!= genotype]
Returns:
list: The chromosome.
'''
segment = []
# random marker positions
size = round(chromosome_size * marker_density)
positions = sorted(random.sample(range(chromosome_size),size)) #sorted list of markers positions
for i in range(0, size):
# Iterate through the list of markers positions
for i in range(0, len(markers_positions)):
# If the current marker is not the first one
if i > 0:
# Calculate the interval between the current marker and the previous one
#TODO
#Kosambi inverse function
IntervalWithPreviousMarker = positions[i] - positions[i-1]
temp = 2 * (IntervalWithPreviousMarker / 100)
IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(temp) - math.exp(-temp)) / (math.exp(temp) + math.exp(-temp)))
IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1]
tempcM = 2 * (IntervalWithPreviousMarker / 100) / len(markers_positions) # markers_nb (= len(markers_positions)) corresponds to bpPercM
IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM)))
rnd = random.random()
# If the random number is greater than the probability of a recombination
#print("before",genotype1, genotype2)
if rnd > 1 - IntervalWithPreviousMarkerInRF: # a recombination occurs
# If the previous marker is A, change the genotype to B
if genotype1 == "A":
genotype1 = "B";
else :
genotype1 = "A";
rnd = random.random()
if rnd > 1 - IntervalWithPreviousMarkerInRF:
# If the previous marker is B, change the genotype to A
if genotype2 == "A":
genotype2 = "B";
else :
genotype1 = "A";
genotype2 = "A";
#print("after",genotype1, genotype2)
# If the current marker is the first one, fix the two first genotypes.
else:
# first marker
rnd = random.random()
# If the random number is greater than 0.5
if rnd > 0.5:
genotype1 = "A"
else :
genotype1 = "B"
rnd = random.random()
if rnd > 0.5:
genotype2 = "B"
else :
genotype2 = "A"
genotype1 = "B"
#TODO : ajouter erreur de séquençage
# sd : standard deviation
g = random.gauss(mean_depth, sd_depth)
#TODO : ajouter erreur de séquençage
# Calculate the depth of the current marker
g = np.random.poisson(mean_depth)
# If the depth of the current marker is less than 0 (safeguard)
if g < 0:
g = 0
# Round the depth of the current marker
site_depth = round(g)
# Initialize x and y
x = 0
y = 0
# If the depth of the current marker is 0, genotype is Missing Data
if site_depth == 0:
genotype = "./."
else :
# If the current marker is A and the previous marker is A, genotype if homozygote A (ref, 0/0)
if genotype1 == "A" and genotype2 == "A":
x = site_depth
y = 0
genotype = "0/0"
# If the current marker is B and the previous marker is B, genotype if homozygote B (alt, 1/1)
elif genotype1 == "B" and genotype2 == "B":
x = 1
y = site_depth
genotype = "1/1"
# If the current marker is neither A nor B, genotype if heterozygous H (0/1)
else:
#x = random.gauss(site_depth, sd2)
# Generate a random number between 0 and the depth of the current marker
x = random.randint(0,site_depth)
y = site_depth - x
# If the depth of x of the current marker is 0, seen as homozygous
if x == 0:
genotype = "1/1"
# If the depth of y of the current marker is not 0, seen as homozygous
elif y == 0:
genotype = "0/0"
# If the depth x and y of the current marker is not 0, seen as heterozygous
else :
genotype = "0/1"
finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:."
#print("reads",genotype1, genotype2)
#print(genotype)
# Append the genotype and the depth of the current marker to the segment list
finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:.:."
segment.append(finalGenotype)
# Return the random segment
return segment
# Generate a list of individuals
def generate_individuals (nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, sd_depth):
def generate_individuals (nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, max_depth):
matrix = []
# Calculate the number of marker requiered
markers_nb = round(chromosome_size * marker_density)
# Generate a sorted list of markers positions
# positions = sorted(random.sample(range(chromosome_size),size))
markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int")
for i in range(nb_individuals):
print("individual " + str(i+1))
matrix.append(generate_chr_for_one_ind(chromosome_size, marker_density, err_rate, mean_depth, sd_depth))
matrix.append(generate_chr_for_one_ind(chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions))
return matrix
return matrix, markers_positions
# Check if a row is correct
def is_correct (row):
l = len(row)
# # Check if a row is correct
# def is_correct (row):
# l = len(row)
count_h = row.count("H") / l
count_a = row.count("A") / l
count_b = row.count("B") / l
# count_h = row.count("H") / l
# count_a = row.count("A") / l
# count_b = row.count("B") / l
return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3]
# return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3]
chromosome_size = 100000
nb_individuals = 100
# Set the parameters for the script
nb_individuals = 3
err_rate = 0.02
chromosome_size = 44000000
marker_density = 255000/44000000
mean_depth = 3
sd_depth = 1
pop = generate_individuals(nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, sd_depth)
# # Create an empty list to store the individuals
# rows = []
# # Loop through the population
# for i in range(chromosome_size):
# # Create an empty list to store the individual
# row = []
# # Loop through the individuals in the population
# for ind in pop:
# # Append the individual's chromosome to the row
# row.append(ind[i])
# # Append the row to the list of rows
# #row.append(is_correct(row))
# rows.append(row)
max_depth = 10 #TODO
pop = generate_individuals(nb_individuals, chromosome_size, err_rate, marker_density, mean_depth, max_depth)
matrix = pop[0]
markers_positions = pop[1]
# Header to add to the VCF for it to be recognized as such file.
header = [
"##fileformat=VCFv4.1\n",
"##fileDate=20090805\n",
......@@ -157,9 +174,8 @@ header = [
"##FORMAT=<ID=HQ,Number=2,Type=Integer,Description=\"segment Quality\">\n"
]
#with open('test.vcf', 'w') as file:
with open('test.tsv', 'w') as file:
with open('test.vcf', 'w') as file:
# Write the header of the file
file.writelines(header)
......@@ -167,26 +183,10 @@ with open('test.tsv', 'w') as file:
file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + [str(n) for n in range(nb_individuals)] + ["Parent1", "Parent2"] + ["Test"]) + "\n")
# Iterate over each row in the table
#for i, row in enumerate(rows):
for m in range(0, len(pop[0])):
row = ["chr01", str(m + 1), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"]
for i in range(0, len(pop)):
row.append(pop[i][m])
for m in range(0, len(matrix[0])):
row = ["chr01", str(markers_positions[m]), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"]
for i in range(0, len(matrix)):
row.append(matrix[i][m])
row.append("0/0:"+ str(mean_depth) + ":" + str(mean_depth) + ",0:.:.:.:.:.")
row.append("1/1:"+ str(mean_depth) + ":0," + str(mean_depth) + ":.:.:.:.:.")
file.write("\t".join(row) + "\n");
# # Create an empty list to store the converted row
# converted_row = []
# # Iterate over each genotype in the row
# for genotype in ind:
# converted_row.append(genotype + ":.:.:.:.:.")
# file.write("\t".join(["chr01", str(i + 1), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"] + converted_row + ["0/0:"+ mean_depth + ":" + mean_depth + ",0:.:.:.:.:.", "1/1:"+ mean_depth + ":0," + mean_depth + ":.:.:.:.:."]) + "\n" )
##import random; random.sample(range(101), 10) ##10 random values in a range of 100 without duplicate.
\ No newline at end of file
file.write("\t".join(row) + "\n");
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment