noisyplotype.py

import random
import math
import numpy as np


#def generate_chr_for_one_ind (chromosome_size, marker_density, err_rate, mean_depth, max_depth, markers_positions, conversion_factor):
def generate_chr_for_one_ind (mean_depth, markers_positions, conversion_factor, errA, errB):
    '''
    Generate a chromosome for a given size, density of markers, and depth (follwing a gaussian distrib with a mean_depth and sd_depth).

    Args:
        mean_depth (float): The mean sequencing depth of the chromosome
        markers_positions (lst) : liste of marker positions (picked on a grid)
        conversion_factor (float) : Value of the bp per cM conversion (bp chromosome size / cM chromosome size)
        errA & errB (float): Respectively the error rate of observing a B whereas the genotype is truely a A and a A whereas the genotype is truely a B

    Returns:
        segment: A chromosome with each position having a given genotype with a given site depth and allele depth
        segment_error : A chromosome with each position having a given genotype with a given site depth and allele depth considering the error rates errA and errB
    '''

    # Create an empty list to store the chromosome
    segment = []
    # Create an empty list to store the chromosome with error
    segment_error = []
    breakpoints = []

    # Iterate through the list of markers positions (fixed)
    previousMarkerGenotype = ""
    for i in range(0, len(markers_positions)):
        recomb1 = False
        recomb2 = False
        # If the current marker is not the first one
        if i > 0:
            # Calculate the interval between the current marker and the previous one
            #using the Kosambi inverse function to estimate the chance of recombination
            IntervalWithPreviousMarker = markers_positions[i] - markers_positions[i-1]
            tempcM = 2 * (IntervalWithPreviousMarker / 100) / conversion_factor         ## Conversion factor is used to have the bp per cM
            IntervalWithPreviousMarkerInRF = 0.5 * ((math.exp(tempcM) - math.exp(-tempcM)) / (math.exp(tempcM) + math.exp(-tempcM)))
            rnd = random.random()

            # If the random number is greater than the probability of a recombination a recombination occurs
            # Done for genotype 1
            if rnd > 1 - IntervalWithPreviousMarkerInRF:
                recomb1 = True
                # If the previous marker is A, change the genotype to B
                if genotype1 == "A":
                    genotype1 = "B"
                else :
                    genotype1 = "A"

            # Done for genotype 2
            rnd = random.random()
            if rnd > 1 - IntervalWithPreviousMarkerInRF:
                recomb2 = True
                # If the previous marker is B, change the genotype to A
                if genotype2 == "A":
                    genotype2 = "B"
                else :
                    genotype2 = "A"

        # If the current marker is the first one, Set the two first genotypes (random).
        else:
            rnd = random.random()
            # If the random number is greater than 0.5
            if rnd > 0.5:
                genotype1 = "A"
            else :
                genotype1 = "B"

            rnd = random.random()
            if rnd > 0.5:
                genotype2 = "B"
            else :
                genotype2 = "A"

            # initialize previousMarkerGenotype
            if genotype1 == "A" and genotype2 == "A":
                previousMarkerGenotype = "A"
            elif genotype1 == "B" and genotype2 == "B":
                previousMarkerGenotype = "B"
            else :
                previousMarkerGenotype = "H"

        #Breakpoints
        if recomb1 or recomb2:
            #this is a breakpoint [beforeBkpAfter, transition]
            transition = previousMarkerGenotype + " => "

            if genotype1 == "A" and genotype2 == "A":
                previousMarkerGenotype = "A"
            elif genotype1 == "B" and genotype2 == "B":
                previousMarkerGenotype = "B"
            else :
                previousMarkerGenotype = "H"

            transition = transition + previousMarkerGenotype

            breakpoints.append([i,transition])

        # Calculate the site depth of the current marker
        g = np.random.poisson(mean_depth)
        # If the depth of the current marker is less than 0 (safeguard)
        if g < 0:
            g = 0
        # Round the depth of the current marker
        site_depth = round(g)
        # Initialize x and y (with and without error)
        x = 0
        y = 0
        x_error = 0
        y_error = 0

        # If the depth of the current marker is 0, genotype is Missing Data
        if site_depth == 0:
            genotype = "./."
            genotype_error = "./."
        else :
            # If the site is homozygous A (ref, 0/0)
            if genotype1 == "A" and genotype2 == "A":
                x = site_depth
                y = 0
                genotype = "0/0"
                x_error = 0
                y_error = 0
                # Considering the sequencing and mapping error at each site for a given site depth
                for j in range(0,site_depth):
                    rnd = random.random()
                    # If the random number is smaller the error rate, we increase the wronge allele depth
                    if rnd < errA:
                        x_error = x_error + 0
                        y_error = y_error + 1
                    else :
                        x_error = x_error + 1
                        y_error = y_error + 0
                if y_error == site_depth:
                    genotype_error = "1/1"
                elif y_error > 0:
                    genotype_error = "0/1"
                else :
                    genotype_error = "0/0"

            # If the current marker is B and the previous marker is B, genotype if homozygote B (alt, 1/1)
            elif genotype1 == "B" and genotype2 == "B":
                x = 0
                y = site_depth
                genotype = "1/1"
                x_error = 0
                y_error = 0
                # Considering the sequencing and mapping error at each site for a given site depth
                for j in range(0,site_depth):
                    rnd = random.random()
                    # If the random number is smaller the error rate, we increase the wronge allele depth
                    if rnd < errB:
                        x_error = x_error + 1
                        y_error = y_error + 0
                    else :
                        x_error = x_error + 0
                        y_error = y_error + 1
                if x_error == site_depth:
                    genotype_error = "0/0"
                elif x_error > 0:
                    genotype_error = "0/1"
                else :
                    genotype_error = "1/1"

            # If the current marker is neither A nor B, genotype if heterozygous H (0/1)
            else:
                # Generate a random number between 0 and the depth of the current marker
                x = random.randint(0,site_depth)
                y = site_depth - x
                # Error of A and B compensate themselves in heterozygous site so no need to change the x_error and y_error
                x_error = x
                y_error = y
                # If the depth of x of the current marker is 0, it's seen as homozygous site (alt, 1/1)
                if x == 0:
                    genotype = "1/1"
                    genotype_error = "1/1"
                # If the depth of x of the current marker is 0, it's seen as homozygous site (ref, 0/0)
                elif y == 0:
                    genotype = "0/0"
                    genotype_error = "0/0"
                # If the depth x and y of the current marker is not 0, it's seen as heterozygous (0/1)
                else :
                    genotype = "0/1"
                    genotype_error = "0/1"

        #print("reads",genotype1, genotype2)
        #print(genotype)
        # Append the genotype and the depth of the current marker to the segment list
        finalGenotype = str(genotype) + ":" + str(site_depth) + ":" + str(x) + "," + str(y) + ":.:.:.:.:."
        segment.append(finalGenotype)

        # Append the genotype and the depth of the current marker to the segment_error list
        finalGenotype_error = str(genotype_error) + ":" + str(site_depth) + ":" + str(x_error) + "," + str(y_error) + ":.:.:.:.:."
        segment_error.append(finalGenotype_error)

    return segment, segment_error, breakpoints

# Generate a list of individuals
def generate_individuals (nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB):
    matrix = []
    matrix_error = []

    # Calculate the number of marker requiered
    markers_nb = round(chromosome_size * marker_density)

    # Generate a sorted list of markers positions
    #markers_positions = sorted(random.sample(range(chromosome_size),size))
    markers_positions = np.linspace(1, chromosome_size, markers_nb, dtype="int")

    with open('Breakpoints_3x.csv', 'w') as breakpointFile:
        breakpointFile.write(",".join(["sample","average_bkp_position", "bkp_start_position", "bkp_stop_position", "transitionType"]) + "\n");
        for i in range(nb_individuals):
            print("individual " + str(i+1))
            result = generate_chr_for_one_ind(mean_depth, markers_positions, conversion_factor, errA, errB)
            matrix.append(result[0])
            matrix_error.append(result[1])
            for bkp in result[2]:
                startBkp = markers_positions[bkp[0] - 1]
                stopBkp = markers_positions[bkp[0]]
                averageBkpPosition = startBkp + round((stopBkp - startBkp)/2)
                bkp_row = [str(i), str(averageBkpPosition), str(startBkp), str(stopBkp), bkp[1]]
                breakpointFile.write(",".join(bkp_row) + "\n");

    return matrix, matrix_error, markers_positions

# # Check if a row is correct
# def is_correct (row):
#     l = len(row)

#     count_h = row.count("H") / l
#     count_a = row.count("A") / l
#     count_b = row.count("B") / l

#     return [count_a, count_b, count_h, 0.45 <= count_h <= 0.55 and 0.2 <= count_b <= 0.3 and 0.2 <= count_a <= 0.3]

# Set the parameters for the script
nb_individuals = 100
chromosome_size = 44000000
cMsize = 180    ## Size of genetic map
conversion_factor = chromosome_size/cMsize   ## Corresponds to a bpPercM conversion ! Needs to be fixed... Does not produce the correct division!
marker_density = 0.0055
mean_depth = 1.5
max_depth = 3 #TODO (better estimate of max?)
errA = 0.005
errB = 0.005

## Generate the pop and associate the results to the matrixes with and without errors
pop = generate_individuals(nb_individuals, chromosome_size, marker_density, mean_depth, conversion_factor, errA, errB)
matrix = pop[0]
matrix_error = pop[1]
markers_positions = pop[2]

# Header to add to the VCF for it to be recognized as such file.
header = [
"##fileformat=VCFv4.1\n",
"##fileDate=20090805\n",
"##source=myImputationProgramV3.1\n",
"##reference=/shared/projects/recombinationlandscape/REFERENCE_GENOME/Osat_Azucena_AGI_chrOK_uniline_WithNIPBARorganelles.fasta\n",
"##contig=<ID=chr01,length=44011168>\n",
"##phasing=partial\n",
"##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">\n",
"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n",
"##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">\n",
"##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n",
"##INFO=<ID=DB,Number=0,Type=Flag,Description=\"dbSNP membership, build 129\">\n",
"##INFO=<ID=H2,Number=0,Type=Flag,Description=\"HapMap2 membership\">\n",
"##FILTER=<ID=q10,Description=\"Quality below 10\">\n",
"##FILTER=<ID=s50,Description=\"Less than 50% of samples have data\">\n",
"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">\n",
"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n",
"##FORMAT=<ID=HQ,Number=2,Type=Integer,Description=\"segment Quality\">\n"
]


## Write the VCF files (with and without errors)
with open('test_3x.vcf', 'w') as file:
    # Write the header of the file
    file.writelines(header)

    # Write the header of the file
    file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + [str(n) for n in range(nb_individuals)] + ["Parent1", "Parent2"]) + "\n")

    # Iterate over each row in the table
    for m in range(0, len(matrix[0])):
        row = ["chr01", str(markers_positions[m]), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"]
        for i in range(0, len(matrix)):
            row.append(matrix[i][m])
        row.append("0/0:"+ str(round(mean_depth)) + ":" + str(round(mean_depth)) + ",0:.:.:.:.")
        row.append("1/1:"+ str(round(mean_depth)) + ":0," + str(round(mean_depth)) + ":.:.:.:.")
        file.write("\t".join(row) + "\n");

with open('test_error_3x.vcf', 'w') as file:
    # Write the header of the file
    file.writelines(header)

    # Write the header of the file
    file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + [str(n) for n in range(nb_individuals)] + ["Parent1", "Parent2"]) + "\n")

    # Iterate over each row in the table
    for m in range(0, len(matrix_error[0])):
        row = ["chr01", str(markers_positions[m]), ".", "A", "T", ".", ".", ".", "GT:DP:AD:RO:QR:AO:QA:GL"]
        for i in range(0, len(matrix_error)):
            row.append(matrix_error[i][m])
        row.append("0/0:"+ str(round(mean_depth)) + ":" + str(round(mean_depth)) + ",0:.:.:.:.")
        row.append("1/1:"+ str(round(mean_depth)) + ":0," + str(round(mean_depth)) + ":.:.:.:.")
        file.write("\t".join(row) + "\n");