Statistics_methylation_Oryza.py

####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
# Autor: Fadwa EL KHADDAR
# Lab : DIADE - IRD
# University : Montpellier - France

import os
import subprocess
import numpy as np
import pandas as pd
import glob
import sys
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import argparse
import re
from tkinter import *
import tkinter as tk
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

matplotlib.use('TkAgg')
# Arguments

parser = argparse.ArgumentParser(description=" Script to generate methylation statistics ")
parser.add_argument('-fasta', type=str, help="Path to fasta file")
parser.add_argument('-i', type=str, help='Path to the directory containing the .tsv files ')
parser.add_argument('-t', type=int, help='Threshold of mapped methylated reads ')
parser.add_argument('-o', type=str, help='Output directory ')

args = parser.parse_args()


# Function definition

def CpG_context(df_chromosome):
    global CpG
    global nb_C
    global nb_CpG
    motif_counts = {}
    motif_sum = {}  # Dictionnary to store count all occurences
    motif_occurrences = {}  # Dictionnary to store count of each occurence
    positions_par_chromosome = {}
    for motif in CpG:
        motif_counts[motif] = 0
        motif_sum[motif] = 0
        motif_occurrences[motif] = []
    rows = []

    # Loop through k_mer column of treated dataset

    for i, row in treated_df.iterrows():
        kmer = row['k_mer']
        first_two_letters = kmer[:2]
        if "CG" in first_two_letters:
            motif_count = first_two_letters.count("CG")
            motif_counts["CG"] += motif_count
            if motif_count > 0:
                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
                                   re.finditer("CG", first_two_letters)]  # Store the positions of the motif
                motif_occurrences["CG"].extend(motif_positions)
                rows.append(row)
    rows_CG_df = pd.DataFrame(rows)
    rows_CG = rows_CG_df.sort_values("chromosome")
    output_file = os.path.join(output_dir, filebasename + "_CG.csv")
    rows_CG.to_csv(output_file, index=False, sep='\t')
    # if not rows_CG.empty:
    #    for chromosome in rows_CG['chromosome'].unique():
    #        positions_par_chromosome[chromosome] = list(rows_CG.loc[rows_CG['chromosome'] == chromosome, ['strand', 'position']])
    total_count_CpG = sum(motif_counts.values())
    pourcentage_CpG = (total_count_CpG / nb_CpG) * 100
    pourcentage_C = (total_count_CpG / nb_C) * 100

    print(" --- CpG STATISTICS --- ")
    print()
    print(f" * The number of occurrences of CpG methylation : {total_count_CpG} ")
    print(
        f" * The percentage of CpG methylation among all CpGs in the genome : {pourcentage_CpG : .2f} %")
    print(
        f" * The percentage of methylated cytosines (CpG type) among all cytosines in the genome : {pourcentage_C : .2f} %")
    print()

    return total_count_CpG


def CHG_context(df_chromosome):
    global CHG
    global nb_C
    global nb_CpG
    global output_dir
    global filebasename
    motif_counts = {}  # create dictionnary to store the times the motif appears in the sequence
    motif_sum = {}  #
    motif_occurrences = {}  # create a dictionary to store occurrences of each motif
    positions_par_chromosome = {}

    for motif in CHG:
        motif_counts[motif] = 0
        motif_sum[motif] = 0
        motif_occurrences[motif] = []  # initialize an empty list for each motif
    rows = []
    for i, row in treated_df.iterrows():
        kmer = row['k_mer']
        for motif in CHG:
            motif_count = kmer.count(motif)
            motif_counts[motif] += motif_count
            if motif_count > 0:
                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
                                   re.finditer(motif, kmer)]  # Store the positions of the motif
                motif_occurrences[motif].extend(motif_positions)
                rows.append(row)
    rows_CHG_df = pd.DataFrame(rows)
    rows_CHG = rows_CHG_df.sort_values("chromosome")
    output_file = os.path.join(output_dir, filebasename + "_CHG.csv")
    rows_CHG.to_csv(output_file, index=False, sep='\t')
    # if not rows_CHG.empty:
    #    for chromosome in rows_CHG['chromosome'].unique():
    #        positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, ['strand', 'position']])
    total_count_CHG = sum(motif_counts.values())
    pourcentage_C_total = (total_count_CHG / nb_C) * 100

    print(" --- CHG STATISTICS --- ")
    print()
    print(
        f"The percentage of methylated cytosines (CHG type) among all cytosines in the genome:  {pourcentage_C_total: .2f} %")
    print()
    print(f" The number of occurrences of CHG methylation : {total_count_CHG}, which includes : ")
    for motif in CHG:
        pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
        print(f"    -> {motif} apprears {motif_counts[motif]} times")
        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
        print()

    return total_count_CHG


def CHH_context(df_chromosome):
    global CHH
    global nb_C
    global nb_CpG
    global output_dir
    global filebasename
    motif_counts = {}  # create dictionnary to store the times the motif appears in the sequence
    motif_sum = {}  #
    motif_occurrences = {}  # create a dictionary to store occurrences of each motif
    positions_par_chromosome = {}

    for motif in CHH:
        motif_counts[motif] = 0
        motif_sum[motif] = 0
        motif_occurrences[motif] = []  # initialize an empty list for each motif
    rows = []
    for i, row in treated_df.iterrows():
        kmer = row['k_mer']
        for motif in CHH:
            motif_count = kmer.count(motif)
            motif_counts[motif] += motif_count
            if motif_count > 0:
                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
                                   re.finditer(motif, kmer)]  # Store the positions of the motif
                motif_occurrences[motif].extend(motif_positions)
                rows.append(row)
    rows_CHH_df = pd.DataFrame(rows)
    rows_CHH = rows_CHH_df.sort_values('chromosome')
    output_file = os.path.join(output_dir, filebasename + "_CHH.csv")
    rows_CHH.to_csv(output_file, index=False, sep='\t')
    # if not rows_CHH.empty:
    #    for chromosome in rows_CHH['chromosome'].unique():
    #        positions_par_chromosome[chromosome] = list(
    #            rows_CHH.loc[rows_CHH['chromosome'] == chromosome, ['strand', 'position']])
    total_count_CHH = sum(motif_counts.values())
    pourcentage_C_total = (total_count_CHH / nb_C) * 100
    print(" --- CHH STATISTICS --- ")
    print()
    print(
        f"The percentage of methylated cytosines (CHH type) among all cytosines in the genome:  {pourcentage_C_total: .2f} %")
    print()
    print(f" The number of occurrences of CHH methylation : {total_count_CHH}, which includes : ")
    for motif in CHH:
        pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
        print(f"    -> {motif} apprears {motif_counts[motif]} times")
        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
        print()
    return total_count_CHH


def plotting(counts, labels):
    fig, ax = plt.subplots(figsize=(8, 6), subplot_kw=dict(aspect="equal"))

    fig = plt.figure()
    ax = fig.add_subplot(111)
    wedges, texts, autotexts = ax.pie(counts, autopct='%1.1f%%')

    bbox_props = dict(boxstyle="square, pad=0.3", fc="w", ec="k", lw=0.72)
    kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center")

    for i, p in enumerate(wedges):
        ang = (p.theta2 - p.theta1) / 2. + p.theta1
        y = np.sin(np.deg2rad(ang))
        x = np.cos(np.deg2rad(ang))
        horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
        connectionstyle = "angle,angleA=0,angleB={}".format(ang)
        kw["arrowprops"].update({"connectionstyle": connectionstyle})
        ax.annotate(f"{labels[i]}: {counts[i]}", xy=(x, y), xytext=(1.35 * np.sign(x), 1.4 * y),
                    horizontalalignment=horizontalalignment, **kw)

    plt.title(f"Methylation context amoung all cytosines in the sample : {filebasename}")
    plt.show()

print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant  --------####")

# list of the three methylation contexts

CpG = ["CG"]
CHG = ["CCG", "CAG", "CTG"]
CHH = ["CCC", "CAA", "CTT", "CCA", "CCT", "CAT", "CAC", "CTA", "CTC"]

path = args.i
fasta = args.fasta
# nb_CpG = args.cpg
seuil = args.t
output_dir = args.o

# Running seqtk
result = subprocess.run(f"seqtk comp {fasta} | awk '{{C+=$4}}END{{print C}}'", shell=True, capture_output=True,
                        text=True)
if result.stdout.strip():
    nb_C = int(result.stdout.strip())
    print(f"Le nombre total de cytosine : {nb_C}")
else:
    print("Erreur : la commande n'a pas retourné de résultat.")

result = subprocess.run(f"seqtk comp {fasta} | awk '{{CpG+=$10}}END{{print CpG}}'", shell=True, capture_output=True,
                        text=True)
if result.stdout.strip():
    nb_CpG = int(result.stdout.strip())
    print(f"Le nombre total de CpG : {nb_CpG}")
else:
    print("Erreur : la commande n'a pas retourné de résultat.")

# read files
files = path + "/*.tsv"  # store the path to files
chromosomes = []
chromosome_results = {'cpg': 0, 'chg': 0, 'chh': 0}

for filename in glob.glob(files):
    filebasename = os.path.splitext(os.path.basename(filename))[0]
    print()
    print(f" Statistics for the file: {filebasename}")
    print()
    names = ["chromosome", "position", "strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified",
             "count_unmodified", "coverage", "modification_frequency", "k_mer"]
    call_mods = pd.read_csv(filename, sep='\t', names=names)

    df = pd.DataFrame(call_mods)

    couverture = df["coverage"].mean()
    couverture_int = int(couverture)
    print(f" The mean coverage: {couverture_int}")
    print()

    print(f" %%% -- The following statistics are performed on a dataset whose number of reads\n"
          f" in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")
    print()
    methyl = df[df["count_modified"] >= seuil]
    treated_df = methyl[['chromosome', 'strand', 'position', 'k_mer']].copy()
    treated_df['k_mer'] = treated_df['k_mer'].str[2:]
    unique_chromosomes = treated_df['chromosome'].unique()
for chromosome in unique_chromosomes:
    df_chromosome = treated_df[treated_df['chromosome'] == chromosome]
    result_cpg = CpG_context(df_chromosome)
    result_chg = CHG_context(df_chromosome)
    result_chh = CHH_context(df_chromosome)
chromosomes.extend(unique_chromosomes)
chromosome_results = {'cpg': result_cpg, 'chg': result_chg, 'chh': result_chh}

counts = [result_cpg, result_chg, result_chh]
labels = ["CpG", "CHG", "CHH"]
plotting(counts, labels)