Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • diade/recombination_landscape/popsimul
1 result
Show changes
Commits on Source (34)
# PopSimul
## Authors
Cécile Triay (IRD)
Alice Boizet (CIRAD)
Mathias Lorieux (IRD)
Mathieu Triay (Atelier Triay)
## Description
This tool enables to simulate a VCF of a bi-parental population. It can be used to produce a VCF under controlled conditions with setting parameters such as average depth, marker density and expected error rate. It produces 2 VCF files with the simulated genotypes, one with simulated data, and one with the same simulated data but with simulated genotyping errors.
The Breakpoints csv file contains the positions of exact genotypes transitions based on the simulated data. This tool has been used to test NOISYmputer (https://gitlab.cirad.fr/noisymputer/noisymputerstandalone) to compare exact Breakpoints position to Breakpoints positions determined on imputed data.
## Requirements
python 3
## Running popsimul
``` python popsimul.py -o test1511.vcf -ni 10 -s 44000000 -cM 180 -nm 220000 -a 3 -m 10 -eA 0.03 -eB 0.05```
### Parameters description
You can see all parameters by running this command
``` python popsimul.py -h```
| Short | Long | Description | Default value |
|---|---|---|---|
| -o | --output | Name for the 3 generated files : [output].vcf, [output]_errors.vcf, [output]_Breakpoints.csv | popsimul |
| -ni | --nb_individuals | number of individuals | 10 |
| -bp | --chr_bp | chromosome size | 44000000|
| -cM | --cM_size | size of genetic map | 180 |
| -nm | --nb_markers | number of markers| 220000 |
| -a | --average_depth | average depth | 3 |
| -m | --max_depth | max depth | 10 |
| -eA | --error_rate_A | error rate for A | 0.05 |
| -eB | --error_rate_B | error rate for B | 0.05 |
import random
import math
import numpy as np
import argparse
#def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions, conversion_factor):
def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, errA, errB):
'''
Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth).
Args:
mean_depth (float): The mean sequencing depth of the chromosome
markers_positions (lst) : liste of marker positions (picked on a grid)
conversion_factor (float) : Value of the bp per cM conversion (bp chromosome size / cM chromosome size)
errA & errB (float): Respectively the error rate of observing a B whereas the genotype is truely a A and a A whereas the genotype is truely a B
Returns:
segment: An array containing for each snp of the chromosome, this array : [initial_gt, genotype, genotype_error, site_depth, x_error, y_error]
- intial_gt : simulated genotype with no noise and no sequencing error
- genotype : simulated genotype with noise
- genotype_error : simulated genotype with noise and sequencing error
- site_depth : simulated read depth
- x_error : simulated allelic depth for A
- y_error : simulated allelic depth for B
'''
# Create an empty list to store the chromosome
segment = []
breakpoints = []
# Iterate through the list of markers positions (fixed)
previous_marker_gt = ""
for i in range(0, len(markers_positions)):
recomb1 = False
recomb2 = False
# If the current marker is not the first one
if i > 0:
# Calculate the interval between the current marker and the previous one
#using the Kosambi inverse function to estimate the chance of recombination
IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1]
tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor ## Conversion factor is used to have the bp per cM
IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM)))
rnd = random.random()
# If the random number is greater than the probability of a recombination a recombination occurs
# Done for genotype 1
if rnd > 1 - IntervalWithPreviousMarkerInRF:
recomb1 = True
# If the previous marker is A, change the genotype to B
if genotype1 == "A":
genotype1 = "B"
else :
genotype1 = "A"
# Done for genotype 2
rnd = random.random()
if rnd > 1 - IntervalWithPreviousMarkerInRF:
recomb2 = True
# If the previous marker is B, change the genotype to A
if genotype2 == "A":
genotype2 = "B"
else :
genotype2 = "A"
# If the current marker is the first one, Set the two first genotypes (random).
else:
rnd = random.random()
# If the random number is greater than 0.5
if rnd > 0.5:
genotype1 = "A"
else :
genotype1 = "B"
rnd = random.random()
if rnd > 0.5:
genotype2 = "B"
else :
genotype2 = "A"
# initialize previous_marker_gt
if genotype1 == "A" and genotype2 == "A":
previous_marker_gt = "A"
elif genotype1 == "B" and genotype2 == "B":
previous_marker_gt = "B"
else :
previous_marker_gt = "H"
# Breakpoints for this individual
if recomb1 or recomb2:
#this is a breakpoint [beforeBkpAfter, transition]
transition = previous_marker_gt + " => "
if genotype1 == "A" and genotype2 == "A":
previous_marker_gt = "A"
elif genotype1 == "B" and genotype2 == "B":
previous_marker_gt = "B"
else :
previous_marker_gt = "H"
transition = transition + previous_marker_gt
breakpoints.append([i,transition])
# Calculate the site depth of the current marker
g = np.random.poisson(mean_depth)
# If the depth of the current marker is less than 0 (safeguard)
if g < 0:
g = 0
# Round the depth of the current marker
site_depth = round(g)
# Initialize x and y (with and without error)
x = 0
y = 0
x_error = 0
y_error = 0
# If the site is homozygous A (ref, 0/0)
if genotype1 == "A" and genotype2 == "A":
x = site_depth
y = 0
initial_gt = "0/0"
genotype = "0/0"
x_error = 0
y_error = 0
# Considering the sequencing and mapping error at each site for a given site depth
for j in range(0, site_depth):
rnd = random.random()
# If the random number is smaller the error rate, we increase the wronge allele depth
if rnd < errA:
x_error = x_error + 0
y_error = y_error + 1
else :
x_error = x_error + 1
y_error = y_error + 0
if y_error == site_depth:
genotype_error = "1/1"
elif y_error > 0:
genotype_error = "0/1"
else :
genotype_error = "0/0"
# If the current marker is B and the previous marker is B, genotype if homozygote B (alt, 1/1)
elif genotype1 == "B" and genotype2 == "B":
x = 0
y = site_depth
initial_gt = "1/1"
genotype = "1/1"
x_error = 0
y_error = 0
# Considering the sequencing and mapping error at each site for a given site depth
for j in range(0,site_depth):
rnd = random.random()
# If the random number is smaller the error rate, we increase the wronge allele depth
if rnd < errB:
x_error = x_error + 1
y_error = y_error + 0
else :
x_error = x_error + 0
y_error = y_error + 1
if x_error == site_depth:
genotype_error = "0/0"
elif x_error > 0:
genotype_error = "0/1"
else :
genotype_error = "1/1"
# If the current marker is neither A nor B, genotype is heterozygous H (0/1)
else:
initial_gt = "0/1"
# Generate a random number between 0 and the depth of the current marker
x = random.randint(0,site_depth)
y = site_depth - x
# Error of A and B compensate themselves in heterozygous site so no need to change the x_error and y_error
x_error = x
y_error = y
# If the depth of x of the current marker is 0, it's seen as homozygous site (alt, 1/1)
if x == 0:
genotype = "1/1"
genotype_error = "1/1"
# If the depth of x of the current marker is 0, it's seen as homozygous site (ref, 0/0)
elif y == 0:
genotype = "0/0"
genotype_error = "0/0"
# If the depth x and y of the current marker is not 0, it's seen as heterozygous (0/1)
else :
genotype = "0/1"
genotype_error = "0/1"
if site_depth == 0:
genotype = "./."
genotype_error = "./."
# data is an array that stores genotypes and depth for one snp and one individual
data = [initial_gt , genotype, genotype_error, site_depth, x_error, y_error]
segment.append(data)
return segment, breakpoints
# Generate a list of individuals
def generate_individuals (nb_individuals, chromosome_size, nb_markers, mean_depth, conversion_factor, errA, errB):
matrix = [] #matrix is an array of genotypes for each individual
#matrix_error = []
# Generate a sorted list of markers positions
#markers_positions = sorted(random.sample(range(chromosome_size),size))
markers_positions = np.linspace(1, chromosome_size, nb_markers, dtype="int")
with open(args.output + '_Breakpoints.csv', 'w') as breakpointFile:
breakpointFile.write(",".join(["sample","average_bkp_position", "bkp_start_position", "bkp_stop_position", "transitionType"]) + "\n");
for i in range(nb_individuals):
print("individual " + str(i+1) + " / " + str(nb_individuals), end="\r")
bkps_nb = 0
result = []
while bkps_nb < 1 : #resimulate until we get at least one crossing over for this individual
result = generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor, errA, errB)
bkps_nb = len(result[1])
matrix.append(result[0])
for bkp in result[1]:
startBkp = markers_positions[bkp[0] - 1]
stopBkp = markers_positions[bkp[0]]
averageBkpPosition = startBkp + round((stopBkp - startBkp)/2)
bkp_row = [str(i), str(averageBkpPosition), str(startBkp), str(stopBkp), bkp[1]]
breakpointFile.write(",".join(bkp_row) + "\n");
print("The file", args.output + "_Breakpoint.csv" , "has been generated")
return matrix, markers_positions
# Write VCF file
def write_vcf(output_file_name, matrix, gt_data_position, with_depth):
# Header to add to the VCF for it to be recognized as such file.
header = [
"##fileformat=VCFv4.1\n",
"##fileDate=20090805\n",
"##source=myImputationProgramV3.1\n",
"##reference=/shared/projects/recombinationlandscape/REFERENCE_GENOME/Osat_Azucena_AGI_chrOK_uniline_WithNIPBARorganelles.fasta\n",
"##contig=<ID=chr01,length=44011168>\n",
"##phasing=partial\n",
"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n",
"##FORMAT=<ID=AD,Number=2,Type=Integer,Description=\"Allelic Depth\">\n"
]
nb_individuals = len(matrix)
nb_snp = len(matrix[0])
with open(output_file_name + '.vcf', 'w') as file:
# Write the header of the file
file.writelines(header)
# Write the header of the file
file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + [str(n) for n in range(nb_individuals)] + ["Parent1", "Parent2"]) + "\n")
# Iterate over each row in the table
for m in range(0, nb_snp):
if with_depth:
format_info = "GT:DP:AD"
else :
format_info = "GT"
row = ["chr01", str(markers_positions[m]), ".", "A", "T", ".", ".", ".", format_info]
# add all individual genotypes
for i in range(0, nb_individuals):
vcf_data = str(matrix[i][m][gt_data_position])
if with_depth:
vcf_data = vcf_data + ":" + str(matrix[i][m][3]) + ":" + str(matrix[i][m][4]) + "," + str(matrix[i][m][5])
row.append(vcf_data)
# add parent genotypes
if with_depth:
p1_data = "0/0:"+ str(round(mean_depth)) + ":" + str(round(mean_depth)) + ",0"
p2_data = "1/1:"+ str(round(mean_depth)) + ":0," + str(round(mean_depth))
else :
p1_data = "0/0"
p2_data = "1/1"
row.append(p1_data)
row.append(p2_data)
file.write("\t".join(row) + "\n")
# # Check if a row is correct
# def is_correct (row):
# l = len(row)
# count_h = row.count("H") / l
# count_a = row.count("A") / l
# count_b = row.count("B") / l
# return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3]
# define commandline arguments
parser = argparse.ArgumentParser(description='PopSimul')
parser.add_argument('-o', '--output', help='Output files name', default="popsimul")
parser.add_argument('-ni', '--nb_individuals', help='Number of individuals', default=10)
parser.add_argument('-bp', '--chr_bp', help='Chromosome size in bp', default=44000000)
parser.add_argument('-cM', '--cM_size', help='Size of genetic map', default=180)
parser.add_argument('-nm', '--nb_markers', help='Number of markers', default=220000)
parser.add_argument('-a', '--average_depth', help='Depth average', default=3)
parser.add_argument('-m', '--max_depth', help='Maximum possible depth', default=10)
parser.add_argument('-eA', '--error_rate_A', help='Error rate', default=0.05)
parser.add_argument('-eB', '--error_rate_B', help='Error rate', default=0.05)
args = parser.parse_args()
# Set the parameters for the script
nb_individuals = int(args.nb_individuals)
chromosome_size = int(args.chr_bp)
cMsize = int(args.cM_size) ## Size of genetic map
conversion_factor = chromosome_size/cMsize ## Corresponds to a bpPercM conversion ! Needs to be fixed... Does not produce the correct division!
#conversion_factor = 233000
#marker_density = 255000/44000000
nb_markers = int(args.nb_markers)
mean_depth = float(args.average_depth)
max_depth = float(args.max_depth) #TODO better estimate for max ?
errA = float(args.error_rate_A)
errB = float(args.error_rate_B)
print("-----USED PARAMETERS-----")
print("nb_individuals =",str(nb_individuals))
print("chr_bp =",str(chromosome_size))
print("cMsize =",str(cMsize))
print("nb_markers =",str(nb_markers))
print("average_depth =",str(mean_depth))
print("max_depth =",str(max_depth))
print("error_rate_A =",str(errA))
print("error_rate_B =",str(errB))
print("------------------------")
## Generate the pop and associate the results to the matrixes with and without errors
pop = generate_individuals(nb_individuals, chromosome_size, nb_markers, mean_depth, conversion_factor, errA, errB)
matrix = pop[0]
markers_positions = pop[1]
# Header to add to the VCF for it to be recognized as such file.
header = [
"##fileformat=VCFv4.1\n",
"##fileDate=20090805\n",
"##source=myImputationProgramV3.1\n",
"##reference=/shared/projects/recombinationlandscape/REFERENCE_GENOME/Osat_Azucena_AGI_chrOK_uniline_WithNIPBARorganelles.fasta\n",
"##contig=<ID=chr01,length=44011168>\n",
"##phasing=partial\n",
"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n",
"##FORMAT=<ID=AD,Number=2,Type=Integer,Description=\"Allelic Depth\">\n"
]
## Write simulated VCF file with no noise and no sequencing error
write_vcf(args.output, matrix, 0, True)
print("The file", args.output + ".vcf has been generated")
## Write simulated VCF file with noise but no sequencing error
filename = args.output + "_noise"
write_vcf(filename, matrix, 1, True)
print("The file", filename + ".vcf has been generated")
## Write simulated VCF file with noise and sequencing error
filename = args.output + "_noise_errors"
write_vcf(filename, matrix, 2, True)
print("The file", filename + ".vcf has been generated")
This diff is collapsed.