Update Statistics_Calculation_DeepSignalPlant

5f53a3ce · fadwael.khaddar_ird.fr · 4c12ef1d · 4c12ef1d · 5f53a3ce
Commit 5f53a3ce authored 1 year ago by fadwael.khaddar_ird.fr
--- a/Statistics_Calculation_DeepSignalPlant
+++ b/Statistics_Calculation_DeepSignalPlant
-
--- a/Statistics_Calculation_DeepSignalPlant.py
+++ b/Statistics_Calculation_DeepSignalPlant.py
+###-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
+# Autor: Fadwa EL KHADDAR
+# Lab : DIADE - IRD
+# University : Montpellier - France
+
+import os
+import subprocess
+import numpy as np
+import sys
+import matplotlib
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import argparse
+import re
+from tkinter import *
+import tkinter as tk
+from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
+matplotlib.use('TkAgg')
+# Arguments
+
+parser = argparse.ArgumentParser(description=" Script to generate methylation statistics ")
+parser.add_argument('-fasta', type=str, help="Path to fasta file" )
+parser.add_argument('-i', type=str, help='Path to the directory containing the .tsv files ')
+parser.add_argument('-t', type=int, help='Threshold of mapped methylated reads ')
+parser.add_argument('-o', type=str, help='Output directory ')
+
+args = parser.parse_args()
+
+
+# Function definition
+
+def CpG_context():
+    global CpG
+    global nb_C
+    global nb_CpG
+    motif_counts = {}
+    motif_sum = {}          # Dictionnary to store count all occurences
+    motif_occurrences = {}  # Dictionnary to store count of each occurence
+    positions_par_chromosome = {}
+    for motif in CpG:
+        motif_counts[motif] = 0
+        motif_sum[motif] = 0
+        motif_occurrences[motif] = []
+    rows= []
+
+# Loop through k_mer column of treated dataset
+
+    for i, row in treated_df.iterrows():
+        kmer = row['k_mer']
+        first_two_letters = kmer[:2]
+        if "CG" in first_two_letters:
+            motif_count = first_two_letters.count("CG")
+            motif_counts["CG"] += motif_count
+            if motif_count > 0:
+                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
+                                   re.finditer("CG", first_two_letters)]  # Store the positions of the motif
+                motif_occurrences["CG"].extend(motif_positions)
+                rows.append(row)
+    rows_CG_df = pd.DataFrame(rows)
+    rows_CG = rows_CG_df.sort_values("chromosome")
+    output_file = os.path.join(output_dir, filebasename + "_CG.csv")
+    rows_CG.to_csv(output_file, index=False, sep='\t')
+    #if not rows_CG.empty:
+    #    for chromosome in rows_CG['chromosome'].unique():
+    #        positions_par_chromosome[chromosome] = list(rows_CG.loc[rows_CG['chromosome'] == chromosome, ['strand', 'position']])
+    total_count_CpG = sum(motif_counts.values())
+    pourcentage_CpG = (total_count_CpG / nb_CpG) * 100
+    pourcentage_C = (total_count_CpG / nb_C) * 100
+    
+    print(" --- CpG STATISTICS --- ")
+    print()
+    print(f" * The number of occurrences of CpG methylation : {total_count_CpG} ")
+    print(
+        f" * The percentage of CpG methylation among all CpGs in the genome : {pourcentage_CpG : .2f} %")
+    print(
+        f" * The percentage of methylated cytosines (CpG type) among all cytosines in the genome : {pourcentage_C : .2f} %")
+    print()
+
+    return total_count_CpG
+
+def CHG_context():
+    global CHG
+    global nb_C
+    global nb_CpG
+    global output_dir
+    global filebasename
+    motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
+    motif_sum = {} #
+    motif_occurrences = {}  # create a dictionary to store occurrences of each motif
+    positions_par_chromosome = {}
+
+    for motif in CHG:
+        motif_counts[motif] = 0
+        motif_sum[motif] = 0
+        motif_occurrences[motif] = []  # initialize an empty list for each motif
+    rows = []
+    for i, row in treated_df.iterrows():
+        kmer= row['k_mer']
+        for motif in CHG:
+            motif_count = kmer.count(motif)
+            motif_counts[motif] += motif_count
+            if motif_count > 0:
+                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in re.finditer(motif, kmer)]  # Store the positions of the motif
+                motif_occurrences[motif].extend(motif_positions)
+                rows.append(row)
+    rows_CHG_df = pd.DataFrame(rows)
+    rows_CHG = rows_CHG_df.sort_values("chromosome")
+    #output_file = os.path.join(output_dir, filebasename + "_CHG.csv")
+    #rows_CHG.to_csv(output_file, index=False, sep='\t')
+    #if not rows_CHG.empty:
+    #    for chromosome in rows_CHG['chromosome'].unique():
+    #        positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, ['strand', 'position']])
+    total_count_CHG = sum(motif_counts.values())
+    pourcentage_C_total = (total_count_CHG / nb_C) * 100
+
+    print(" --- CHG STATISTICS --- ")
+    print()
+    print(
+        f"The percentage of methylated cytosines (CHG type) among all cytosines in the genome:  {pourcentage_C_total: .2f} %")
+    print()
+    print(f" The number of occurrences of CHG methylation : {total_count_CHG}, which includes : ")
+    for motif in CHG:
+        pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
+        print(f"    -> {motif} apprears {motif_counts[motif]} times")
+        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
+        print()
+
+    return total_count_CHG
+
+
+def CHH_context():
+    global CHH
+    global nb_C
+    global nb_CpG
+    global output_dir
+    global filebasename
+    motif_counts = {}  # create dictionnary to store the times the motif appears in the sequence
+    motif_sum = {}  #
+    motif_occurrences = {}  # create a dictionary to store occurrences of each motif
+    positions_par_chromosome = {}
+
+    for motif in CHH:
+        motif_counts[motif] = 0
+        motif_sum[motif] = 0
+        motif_occurrences[motif] = []  # initialize an empty list for each motif
+    rows = []
+    for i, row in treated_df.iterrows():
+        kmer = row['k_mer']
+        for motif in CHH:
+            motif_count = kmer.count(motif)
+            motif_counts[motif] += motif_count
+            if motif_count > 0:
+                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
+                                   re.finditer(motif, kmer)]  # Store the positions of the motif
+                motif_occurrences[motif].extend(motif_positions)
+                rows.append(row)
+    rows_CHH_df = pd.DataFrame(rows)
+    rows_CHH = rows_CHH_df.sort_values("chromosome")
+    #output_file = os.path.join(output_dir, filebasename + "_CHH.csv")
+    #rows_CHH.to_csv(output_file, index=False, sep='\t')
+    #if not rows_CHH.empty:
+    #    for chromosome in rows_CHH['chromosome'].unique():
+    #        positions_par_chromosome[chromosome] = list(
+    #            rows_CHH.loc[rows_CHH['chromosome'] == chromosome, ['strand', 'position']])
+    total_count_CHH = sum(motif_counts.values())
+    pourcentage_C_total = (total_count_CHH / nb_C) * 100
+    print(" --- CHH STATISTICS --- ")
+    print()
+    print(
+        f"The percentage of methylated cytosines (CHH type) among all cytosines in the genome:  {pourcentage_C_total: .2f} %")
+    print()
+    print(f" The number of occurrences of CHH methylation : {total_count_CHH}, which includes : ")
+    for motif in CHH:
+        pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
+        print(f"    -> {motif} apprears {motif_counts[motif]} times")
+        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
+        print()
+    return total_count_CHH
+
+
+def plotting():
+    global nb_C
+    CpG = CpG_context()
+    CHG = CHG_context()
+    CHH = CHH_context()
+
+
+    fig, ax = plt.subplots(figsize=(10,6), subplot_kw=dict(aspect="equal"))
+    nb_unmethylated = nb_C - CpG - CHG - CHH
+    counts = [CpG, CHG, CHH, nb_unmethylated]
+
+    labels = ["CpG", "CHG", "CHH", "unmethylated"]
+
+    wedges, texts, autotexts = ax.pie(counts, autopct='%1.1f%%')
+    bbox_props= dict(boxstyle="square, pad=0.3", fc="w", ec="k", lw=0.72)
+    kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center")
+
+    for i, p in enumerate(wedges):
+        ang = (p.theta2 - p.theta1) / 2. + p.theta1
+        y = np.sin(np.deg2rad(ang))
+        x = np.cos(np.deg2rad(ang))
+        horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
+        connectionstyle = "angle,angleA=0,angleB={}".format(ang)
+        kw["arrowprops"].update({"connectionstyle": connectionstyle})
+        ax.annotate(f"{labels[i]}: {counts[i]}", xy=(x, y), xytext=(1.35 * np.sign(x), 1.4 * y),
+                    horizontalalignment=horizontalalignment, **kw)
+
+    plt.title(f"Methylation context amoung all cytosines in the sample : {filebasename}")
+
+    root = tk.Tk()
+    root.title("Methylation Statistics")
+
+    # Create a canvas to display the figure
+    canvas = FigureCanvasTkAgg(fig, master=root)
+    canvas.draw()
+    canvas.get_tk_widget().pack()
+
+    # Create a button to close the window
+    button = tk.Button(master=root, text="Fermer", command=root.quit)
+    button.pack()
+
+    # Start the tkinter event loop
+    tk.mainloop()
+
+print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant  --------####")
+
+# list of the three methylation contexts
+
+CpG = ["CG"]
+CHG = ["CCG", "CAG", "CTG"]
+CHH = ["CCC", "CAA", "CTT", "CCA", "CCT", "CAT", "CAC", "CTA", "CTC"]
+
+path = args.i
+fasta = args.fasta
+#nb_CpG = args.cpg
+seuil = args.t
+output_dir = args.o
+
+# Running seqtk
+result = subprocess.run(f"seqtk comp {fasta} | awk '{{C+=$4}}END{{print C}}'", shell=True, capture_output=True, text=True)
+nb_C = int(result.stdout.strip())
+print(f"Le nombre total de cytosine : {nb_C}")
+
+result = subprocess.run(f"seqtk comp {fasta} | awk '{{CpG+=$10}}END{{print CpG}}'", shell=True, capture_output=True, text=True)
+nb_CpG = int(result.stdout.strip())
+print(f"Le nombre total de CpG : {nb_CpG}")
+
+
+
+# read files
+
+files = path + "/*.tsv"      # store the path to files
+for filename in glob.glob(files): # to select all files present in that path
+    filebasename = os.path.splitext(os.path.basename(filename))[0] # store the name files
+    print()
+    print(f" Statistics for the file : {filebasename} ")
+    print()
+    names = ["chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"]    #Ajout des entêtes des colonnes
+    call_mods = pd.read_csv(filename, sep='\t', names=names) # read files
+
+    # DataFrame Creation
+
+    df = pd.DataFrame(call_mods)
+
+    # mean coverage :
+
+    couverture= df["coverage"].mean()
+    couverture_int =int(couverture)
+    print(f" The mean coverage : {couverture_int}")
+    print()
+
+    # filtring dataset
+
+    print(f" %%% -- The following statistics are performed on a dataset whose number of reads \n in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")
+    print()
+    methyl= df[df["count_modified"] >= seuil]
+    methylated=pd.DataFrame(methyl) # save only the methylated reads.
+
+    #methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
+    treated_df = methylated[['chromosome', 'strand', 'position', 'k_mer']] #filter dataset
+    treated_df = pd.DataFrame(treated_df)
+    treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer
+
+
+    #CpG_context()
+    #CHG_context()
+    #CHH_context()