###-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
# Autor: Fadwa EL KHADDAR
# Lab : DIADE - IRD
# University : Montpellier - France

import os
import subprocess
import numpy as np
import sys
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import argparse
import re
from tkinter import *
import tkinter as tk
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
matplotlib.use('TkAgg')
# Arguments

parser = argparse.ArgumentParser(description=" Script to generate methylation statistics ")
parser.add_argument('-fasta', type=str, help="Path to fasta file" )
parser.add_argument('-i', type=str, help='Path to the directory containing the .tsv files ')
parser.add_argument('-t', type=int, help='Threshold of mapped methylated reads ')
parser.add_argument('-o', type=str, help='Output directory ')

args = parser.parse_args()


# Function definition

def CpG_context():
    global CpG
    global nb_C
    global nb_CpG
    motif_counts = {}
    motif_sum = {}          # Dictionnary to store count all occurences
    motif_occurrences = {}  # Dictionnary to store count of each occurence
    positions_par_chromosome = {}
    for motif in CpG:
        motif_counts[motif] = 0
        motif_sum[motif] = 0
        motif_occurrences[motif] = []
    rows= []

# Loop through k_mer column of treated dataset

    for i, row in treated_df.iterrows():
        kmer = row['k_mer']
        first_two_letters = kmer[:2]
        if "CG" in first_two_letters:
            motif_count = first_two_letters.count("CG")
            motif_counts["CG"] += motif_count
            if motif_count > 0:
                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
                                   re.finditer("CG", first_two_letters)]  # Store the positions of the motif
                motif_occurrences["CG"].extend(motif_positions)
                rows.append(row)
    rows_CG_df = pd.DataFrame(rows)
    rows_CG = rows_CG_df.sort_values("chromosome")
    output_file = os.path.join(output_dir, filebasename + "_CG.csv")
    rows_CG.to_csv(output_file, index=False, sep='\t')
    #if not rows_CG.empty:
    #    for chromosome in rows_CG['chromosome'].unique():
    #        positions_par_chromosome[chromosome] = list(rows_CG.loc[rows_CG['chromosome'] == chromosome, ['strand', 'position']])
    total_count_CpG = sum(motif_counts.values())
    pourcentage_CpG = (total_count_CpG / nb_CpG) * 100
    pourcentage_C = (total_count_CpG / nb_C) * 100
    
    print(" --- CpG STATISTICS --- ")
    print()
    print(f" * The number of occurrences of CpG methylation : {total_count_CpG} ")
    print(
        f" * The percentage of CpG methylation among all CpGs in the genome : {pourcentage_CpG : .2f} %")
    print(
        f" * The percentage of methylated cytosines (CpG type) among all cytosines in the genome : {pourcentage_C : .2f} %")
    print()

    return total_count_CpG

def CHG_context():
    global CHG
    global nb_C
    global nb_CpG
    global output_dir
    global filebasename
    motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
    motif_sum = {} #
    motif_occurrences = {}  # create a dictionary to store occurrences of each motif
    positions_par_chromosome = {}

    for motif in CHG:
        motif_counts[motif] = 0
        motif_sum[motif] = 0
        motif_occurrences[motif] = []  # initialize an empty list for each motif
    rows = []
    for i, row in treated_df.iterrows():
        kmer= row['k_mer']
        for motif in CHG:
            motif_count = kmer.count(motif)
            motif_counts[motif] += motif_count
            if motif_count > 0:
                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in re.finditer(motif, kmer)]  # Store the positions of the motif
                motif_occurrences[motif].extend(motif_positions)
                rows.append(row)
    rows_CHG_df = pd.DataFrame(rows)
    rows_CHG = rows_CHG_df.sort_values("chromosome")
    #output_file = os.path.join(output_dir, filebasename + "_CHG.csv")
    #rows_CHG.to_csv(output_file, index=False, sep='\t')
    #if not rows_CHG.empty:
    #    for chromosome in rows_CHG['chromosome'].unique():
    #        positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, ['strand', 'position']])
    total_count_CHG = sum(motif_counts.values())
    pourcentage_C_total = (total_count_CHG / nb_C) * 100

    print(" --- CHG STATISTICS --- ")
    print()
    print(
        f"The percentage of methylated cytosines (CHG type) among all cytosines in the genome:  {pourcentage_C_total: .2f} %")
    print()
    print(f" The number of occurrences of CHG methylation : {total_count_CHG}, which includes : ")
    for motif in CHG:
        pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
        print(f"    -> {motif} apprears {motif_counts[motif]} times")
        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
        print()

    return total_count_CHG


def CHH_context():
    global CHH
    global nb_C
    global nb_CpG
    global output_dir
    global filebasename
    motif_counts = {}  # create dictionnary to store the times the motif appears in the sequence
    motif_sum = {}  #
    motif_occurrences = {}  # create a dictionary to store occurrences of each motif
    positions_par_chromosome = {}

    for motif in CHH:
        motif_counts[motif] = 0
        motif_sum[motif] = 0
        motif_occurrences[motif] = []  # initialize an empty list for each motif
    rows = []
    for i, row in treated_df.iterrows():
        kmer = row['k_mer']
        for motif in CHH:
            motif_count = kmer.count(motif)
            motif_counts[motif] += motif_count
            if motif_count > 0:
                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
                                   re.finditer(motif, kmer)]  # Store the positions of the motif
                motif_occurrences[motif].extend(motif_positions)
                rows.append(row)
    rows_CHH_df = pd.DataFrame(rows)
    rows_CHH = rows_CHH_df.sort_values("chromosome")
    #output_file = os.path.join(output_dir, filebasename + "_CHH.csv")
    #rows_CHH.to_csv(output_file, index=False, sep='\t')
    #if not rows_CHH.empty:
    #    for chromosome in rows_CHH['chromosome'].unique():
    #        positions_par_chromosome[chromosome] = list(
    #            rows_CHH.loc[rows_CHH['chromosome'] == chromosome, ['strand', 'position']])
    total_count_CHH = sum(motif_counts.values())
    pourcentage_C_total = (total_count_CHH / nb_C) * 100
    print(" --- CHH STATISTICS --- ")
    print()
    print(
        f"The percentage of methylated cytosines (CHH type) among all cytosines in the genome:  {pourcentage_C_total: .2f} %")
    print()
    print(f" The number of occurrences of CHH methylation : {total_count_CHH}, which includes : ")
    for motif in CHH:
        pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
        print(f"    -> {motif} apprears {motif_counts[motif]} times")
        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
        print()
    return total_count_CHH


def plotting():
    global nb_C
    CpG = CpG_context()
    CHG = CHG_context()
    CHH = CHH_context()


    fig, ax = plt.subplots(figsize=(10,6), subplot_kw=dict(aspect="equal"))
    nb_unmethylated = nb_C - CpG - CHG - CHH
    counts = [CpG, CHG, CHH, nb_unmethylated]

    labels = ["CpG", "CHG", "CHH", "unmethylated"]

    wedges, texts, autotexts = ax.pie(counts, autopct='%1.1f%%')
    bbox_props= dict(boxstyle="square, pad=0.3", fc="w", ec="k", lw=0.72)
    kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center")

    for i, p in enumerate(wedges):
        ang = (p.theta2 - p.theta1) / 2. + p.theta1
        y = np.sin(np.deg2rad(ang))
        x = np.cos(np.deg2rad(ang))
        horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
        connectionstyle = "angle,angleA=0,angleB={}".format(ang)
        kw["arrowprops"].update({"connectionstyle": connectionstyle})
        ax.annotate(f"{labels[i]}: {counts[i]}", xy=(x, y), xytext=(1.35 * np.sign(x), 1.4 * y),
                    horizontalalignment=horizontalalignment, **kw)

    plt.title(f"Methylation context amoung all cytosines in the sample : {filebasename}")

    root = tk.Tk()
    root.title("Methylation Statistics")

    # Create a canvas to display the figure
    canvas = FigureCanvasTkAgg(fig, master=root)
    canvas.draw()
    canvas.get_tk_widget().pack()

    # Create a button to close the window
    button = tk.Button(master=root, text="Fermer", command=root.quit)
    button.pack()

    # Start the tkinter event loop
    tk.mainloop()

print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant  --------####")

# list of the three methylation contexts

CpG = ["CG"]
CHG = ["CCG", "CAG", "CTG"]
CHH = ["CCC", "CAA", "CTT", "CCA", "CCT", "CAT", "CAC", "CTA", "CTC"]

path = args.i
fasta = args.fasta
#nb_CpG = args.cpg
seuil = args.t
output_dir = args.o

# Running seqtk
result = subprocess.run(f"seqtk comp {fasta} | awk '{{C+=$4}}END{{print C}}'", shell=True, capture_output=True, text=True)
nb_C = int(result.stdout.strip())
print(f"Le nombre total de cytosine : {nb_C}")

result = subprocess.run(f"seqtk comp {fasta} | awk '{{CpG+=$10}}END{{print CpG}}'", shell=True, capture_output=True, text=True)
nb_CpG = int(result.stdout.strip())
print(f"Le nombre total de CpG : {nb_CpG}")



# read files

files = path + "/*.tsv"      # store the path to files
for filename in glob.glob(files): # to select all files present in that path
    filebasename = os.path.splitext(os.path.basename(filename))[0] # store the name files
    print()
    print(f" Statistics for the file : {filebasename} ")
    print()
    names = ["chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"]    #Ajout des entêtes des colonnes
    call_mods = pd.read_csv(filename, sep='\t', names=names) # read files

    # DataFrame Creation

    df = pd.DataFrame(call_mods)

    # mean coverage :

    couverture= df["coverage"].mean()
    couverture_int =int(couverture)
    print(f" The mean coverage : {couverture_int}")
    print()

    # filtring dataset

    print(f" %%% -- The following statistics are performed on a dataset whose number of reads \n in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")
    print()
    methyl= df[df["count_modified"] >= seuil]
    methylated=pd.DataFrame(methyl) # save only the methylated reads.

    #methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
    treated_df = methylated[['chromosome', 'strand', 'position', 'k_mer']] #filter dataset
    treated_df = pd.DataFrame(treated_df)
    treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer


    #CpG_context()
    #CHG_context()
    #CHH_context()