Ajout de l'analyse par Chunk

24983914 · fadwael.khaddar_ird.fr · 159e786e · 24983914
Commit 24983914 authored 1 year ago by fadwael.khaddar_ird.fr
--- a/Statistics_methylation_Oryza.py
+++ b/Statistics_methylation_Oryza.py
-####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
+###-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
 # Autor: Fadwa EL KHADDAR
 # Lab : DIADE - IRD
 # University : Montpellier - France
@@ -6,8 +6,6 @@
 import os
 import subprocess
 import numpy as np
-import pandas as pd
-import glob
 import sys
 import matplotlib
 import matplotlib.pyplot as plt
@@ -16,37 +14,43 @@ import argparse
 import re
 from tkinter import *
 import tkinter as tk
+import glob
+import pandas as pd
 from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
-
 matplotlib.use('TkAgg')
 # Arguments

 parser = argparse.ArgumentParser(description=" Script to generate methylation statistics ")
-parser.add_argument('-fasta', type=str, help="Path to fasta file")
+parser.add_argument('-fasta', type=str, help="Path to fasta file" )
 parser.add_argument('-i', type=str, help='Path to the directory containing the .tsv files ')
 parser.add_argument('-t', type=int, help='Threshold of mapped methylated reads ')
 parser.add_argument('-o', type=str, help='Output directory ')

 args = parser.parse_args()

+path = args.i
+fasta = args.fasta
+seuil = args.t
+output_dir = args.o
+

 # Function definition

-def CpG_context(df_chromosome):
+def CpG_context():
    global CpG
    global nb_C
    global nb_CpG
    motif_counts = {}
-    motif_sum = {}  # Dictionnary to store count all occurences
+    motif_sum = {}          # Dictionnary to store count all occurences
    motif_occurrences = {}  # Dictionnary to store count of each occurence
    positions_par_chromosome = {}
    for motif in CpG:
        motif_counts[motif] = 0
        motif_sum[motif] = 0
        motif_occurrences[motif] = []
-    rows = []
+    rows= []

-    # Loop through k_mer column of treated dataset
+# Loop through k_mer column of treated dataset

    for i, row in treated_df.iterrows():
        kmer = row['k_mer']
@@ -59,11 +63,11 @@ def CpG_context(df_chromosome):
                                   re.finditer("CG", first_two_letters)]  # Store the positions of the motif
                motif_occurrences["CG"].extend(motif_positions)
                rows.append(row)
-    rows_CG_df = pd.DataFrame(rows)
-    rows_CG = rows_CG_df.sort_values("chromosome")
-    output_file = os.path.join(output_dir, filebasename + "_CG.csv")
-    rows_CG.to_csv(output_file, index=False, sep='\t')
-    # if not rows_CG.empty:
+    #rows_CG_df = pd.DataFrame(rows)
+    #rows_CG = rows_CG_df.sort_values("chromosome")
+    #output_file = os.path.join(output_dir, filebasename + "_CG.csv")
+    #rows_CG.to_csv(output_file, index=False, sep='\t')
+    #if not rows_CG.empty:
    #    for chromosome in rows_CG['chromosome'].unique():
    #        positions_par_chromosome[chromosome] = list(rows_CG.loc[rows_CG['chromosome'] == chromosome, ['strand', 'position']])
    total_count_CpG = sum(motif_counts.values())
@@ -81,15 +85,14 @@ def CpG_context(df_chromosome):

    return total_count_CpG

-
-def CHG_context(df_chromosome):
+def CHG_context():
    global CHG
    global nb_C
    global nb_CpG
    global output_dir
    global filebasename
-    motif_counts = {}  # create dictionnary to store the times the motif appears in the sequence
-    motif_sum = {}  #
+    motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
+    motif_sum = {} #
    motif_occurrences = {}  # create a dictionary to store occurrences of each motif
    positions_par_chromosome = {}

@@ -99,20 +102,19 @@ def CHG_context(df_chromosome):
        motif_occurrences[motif] = []  # initialize an empty list for each motif
    rows = []
    for i, row in treated_df.iterrows():
-        kmer = row['k_mer']
+        kmer= row['k_mer']
        for motif in CHG:
            motif_count = kmer.count(motif)
            motif_counts[motif] += motif_count
            if motif_count > 0:
-                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
-                                   re.finditer(motif, kmer)]  # Store the positions of the motif
+                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in re.finditer(motif, kmer)]  # Store the positions of the motif
                motif_occurrences[motif].extend(motif_positions)
                rows.append(row)
-    rows_CHG_df = pd.DataFrame(rows)
-    rows_CHG = rows_CHG_df.sort_values("chromosome")
-    output_file = os.path.join(output_dir, filebasename + "_CHG.csv")
-    rows_CHG.to_csv(output_file, index=False, sep='\t')
-    # if not rows_CHG.empty:
+    #rows_CHG_df = pd.DataFrame(rows)
+    #rows_CHG = rows_CHG_df.sort_values("chromosome")
+    #output_file = os.path.join(output_dir, filebasename + "_CHG.csv")
+    #rows_CHG.to_csv(output_file, index=False, sep='\t')
+    #if not rows_CHG.empty:
    #    for chromosome in rows_CHG['chromosome'].unique():
    #        positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, ['strand', 'position']])
    total_count_CHG = sum(motif_counts.values())
@@ -133,7 +135,7 @@ def CHG_context(df_chromosome):
    return total_count_CHG


-def CHH_context(df_chromosome):
+def CHH_context():
    global CHH
    global nb_C
    global nb_CpG
@@ -159,11 +161,11 @@ def CHH_context(df_chromosome):
                                   re.finditer(motif, kmer)]  # Store the positions of the motif
                motif_occurrences[motif].extend(motif_positions)
                rows.append(row)
-    rows_CHH_df = pd.DataFrame(rows)
-    rows_CHH = rows_CHH_df.sort_values('chromosome')
-    output_file = os.path.join(output_dir, filebasename + "_CHH.csv")
-    rows_CHH.to_csv(output_file, index=False, sep='\t')
-    # if not rows_CHH.empty:
+    #rows_CHH_df = pd.DataFrame(rows)
+    #rows_CHH = rows_CHH_df.sort_values("chromosome")
+    #output_file = os.path.join(output_dir, filebasename + "_CHH.csv")
+    #rows_CHH.to_csv(output_file, index=False, sep='\t')
+    #if not rows_CHH.empty:
    #    for chromosome in rows_CHH['chromosome'].unique():
    #        positions_par_chromosome[chromosome] = list(
    #            rows_CHH.loc[rows_CHH['chromosome'] == chromosome, ['strand', 'position']])
@@ -183,15 +185,21 @@ def CHH_context(df_chromosome):
    return total_count_CHH


+def plotting(total_count_CpG_list, total_count_CHG_list, total_count_CHH_list):
+    global nb_C
+    CpG = sum(total_count_CpG_list)
+    CHG = sum(total_count_CHG_list)
+    CHH = sum(total_count_CHH_list)

-def plotting(counts, labels):
-    fig, ax = plt.subplots(figsize=(8, 6), subplot_kw=dict(aspect="equal"))

-    fig = plt.figure()
-    ax = fig.add_subplot(111)
-    wedges, texts, autotexts = ax.pie(counts, autopct='%1.1f%%')
+    fig, ax = plt.subplots(figsize=(10,6), subplot_kw=dict(aspect="equal"))
+    nb_unmethylated = nb_C - CpG - CHG - CHH
+    counts = [CpG, CHG , CHH , nb_unmethylated]

-    bbox_props = dict(boxstyle="square, pad=0.3", fc="w", ec="k", lw=0.72)
+    labels = ["CpG", "CHG", "CHH", "unmethylated"]
+
+    wedges, texts, autotexts = ax.pie(counts, autopct='%1.1f%%')
+    bbox_props= dict(boxstyle="square, pad=0.3", fc="w", ec="k", lw=0.72)
    kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center")

    for i, p in enumerate(wedges):
@@ -205,7 +213,21 @@ def plotting(counts, labels):
                    horizontalalignment=horizontalalignment, **kw)

    plt.title(f"Methylation context amoung all cytosines in the sample : {filebasename}")
-    plt.show()
+
+    root = tk.Tk()
+    root.title("Methylation Statistics")
+
+    # Create a canvas to display the figure
+    canvas = FigureCanvasTkAgg(fig, master=root)
+    canvas.draw()
+    canvas.get_tk_widget().pack()
+
+    # Create a button to close the window
+    button = tk.Button(master=root, text="Fermer", command=root.quit)
+    button.pack()
+
+    # Start the tkinter event loop
+    tk.mainloop()

 print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant  --------####")

@@ -215,66 +237,66 @@ CpG = ["CG"]
 CHG = ["CCG", "CAG", "CTG"]
 CHH = ["CCC", "CAA", "CTT", "CCA", "CCT", "CAT", "CAC", "CTA", "CTC"]

-path = args.i
-fasta = args.fasta
-# nb_CpG = args.cpg
-seuil = args.t
-output_dir = args.o

 # Running seqtk
-result = subprocess.run(f"seqtk comp {fasta} | awk '{{C+=$4}}END{{print C}}'", shell=True, capture_output=True,
-                        text=True)
-if result.stdout.strip():
-    nb_C = int(result.stdout.strip())
-    print(f"Le nombre total de cytosine : {nb_C}")
-else:
-    print("Erreur : la commande n'a pas retourné de résultat.")
-
-result = subprocess.run(f"seqtk comp {fasta} | awk '{{CpG+=$10}}END{{print CpG}}'", shell=True, capture_output=True,
-                        text=True)
-if result.stdout.strip():
-    nb_CpG = int(result.stdout.strip())
-    print(f"Le nombre total de CpG : {nb_CpG}")
-else:
-    print("Erreur : la commande n'a pas retourné de résultat.")
+result = subprocess.run(f"seqtk comp {fasta} | awk '{{C+=$4}}END{{print C}}'", shell=True, capture_output=True, text=True)
+nb_C = int(result.stdout.strip())
+print(f"Le nombre total de cytosine : {nb_C}")

-# read files
-files = path + "/*.tsv"  # store the path to files
-chromosomes = []
-chromosome_results = {'cpg': 0, 'chg': 0, 'chh': 0}
+result = subprocess.run(f"seqtk comp {fasta} | awk '{{CpG+=$10}}END{{print CpG}}'", shell=True, capture_output=True, text=True)
+nb_CpG = int(result.stdout.strip())
+print(f"Le nombre total de CpG : {nb_CpG}")

-for filename in glob.glob(files):
-    filebasename = os.path.splitext(os.path.basename(filename))[0]
-    print()
-    print(f" Statistics for the file: {filebasename}")
-    print()
-    names = ["chromosome", "position", "strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified",
-             "count_unmodified", "coverage", "modification_frequency", "k_mer"]
-    call_mods = pd.read_csv(filename, sep='\t', names=names)

-    df = pd.DataFrame(call_mods)
+# read files
+total_count_CpG_list = []
+total_count_CHH_list = []
+total_count_CHG_list = []

-    couverture = df["coverage"].mean()
-    couverture_int = int(couverture)
-    print(f" The mean coverage: {couverture_int}")
-    print()
+files = path + "/*.tsv"      # store the path to files
+print(
+    f" %%% -- The following statistics are performed on a dataset whose number of reads \n in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")

-    print(f" %%% -- The following statistics are performed on a dataset whose number of reads\n"
-          f" in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")
+for filename in glob.glob(files): # to select all files present in that path
+    filebasename = os.path.splitext(os.path.basename(filename))[0] # store the name files
    print()
-    methyl = df[df["count_modified"] >= seuil]
-    treated_df = methyl[['chromosome', 'strand', 'position', 'k_mer']].copy()
-    treated_df['k_mer'] = treated_df['k_mer'].str[2:]
-    unique_chromosomes = treated_df['chromosome'].unique()
-for chromosome in unique_chromosomes:
-    df_chromosome = treated_df[treated_df['chromosome'] == chromosome]
-    result_cpg = CpG_context(df_chromosome)
-    result_chg = CHG_context(df_chromosome)
-    result_chh = CHH_context(df_chromosome)
-chromosomes.extend(unique_chromosomes)
-chromosome_results = {'cpg': result_cpg, 'chg': result_chg, 'chh': result_chh}
-
-counts = [result_cpg, result_chg, result_chh]
-labels = ["CpG", "CHG", "CHH"]
-plotting(counts, labels)
-
+    print(f" Statistics for the file : {filebasename} ")
+    print()
+    chunksize= 100000 # You can modify this value to fit your chunksize
+    names = ["chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"]    #Ajout des entêtes des colonnes
+    call_mods = pd.read_csv(filename, sep='\t', names=names, chunksize=chunksize) # read files
+    chunk_number = 1
+    for chunk in call_mods:
+        # DataFrame Creation
+        df = pd.DataFrame(chunk)
+
+    # mean coverage :
+        #couverture= df["coverage"].mean()
+        #couverture_int =int(couverture)
+        #print(f" The mean coverage : {couverture_int}")
+        #print()
+        print()
+        methyl= df[df["count_modified"] >= seuil]
+        print(f"Processing chunk {chunk_number}")
+        methylated=pd.DataFrame(methyl) # save only the methylated reads.
+
+        #methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
+        treated_df = methylated[['chromosome', 'strand', 'position', 'k_mer']] #filter dataset
+        treated_df = pd.DataFrame(treated_df)
+        treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer
+        total_count_CpG_chunk = CpG_context()
+        total_count_CHG_chunk = CHG_context()
+        total_count_CHH_chunk = CHH_context()
+        total_count_CpG_list.append(total_count_CpG_chunk)
+        total_count_CHG_list.append(total_count_CHG_chunk)
+        total_count_CHH_list.append(total_count_CHH_chunk)
+        chunk_number += 1
+total_count_CpG_chunk = sum(total_count_CpG_list)
+total_count_CHG_chunk = sum(total_count_CHG_list)
+total_count_CHH_chunk = sum(total_count_CHH_list)
+
+
+#CpG_context()
+#CHG_context()
+#CHH_context()
+plotting(total_count_CpG_list, total_count_CHG_list, total_count_CHH_list)