Modifications

377f1657 · fadwael.khaddar_ird.fr · 4d03aeb6 · 377f1657
Commit 377f1657 authored 1 year ago by fadwael.khaddar_ird.fr
--- a/Statistics_methylation.py
+++ b/Statistics_methylation.py
@@ -9,6 +9,7 @@ import  glob
 import sys
 import matplotlib as plt
 import argparse
+import re

 # Arguments

@@ -40,19 +41,26 @@ def CpG_context():

 # Loop through k_mer column of treated dataset

-    for i, first_two_letters in zip(range(len(treated_df["k_mer"])), treated_df["k_mer"].str.slice(0, 2)):
-        for motif in CpG:
-            motif_count = first_two_letters.count(motif)
-            motif_counts[motif] += motif_count
+    for i, row in treated_df.iterrows():
+        kmer = row['k_mer']
+        first_two_letters = kmer[:2]
+        if "CG" in first_two_letters:
+            motif_count = first_two_letters.count("CG")
+            motif_counts["CG"] += motif_count
            if motif_count > 0:
-                motif_occurrences[motif].append(i)
-                row = treated_df.iloc[i]
+                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
+                                   re.finditer("CG", first_two_letters)]  # Store the positions of the motif
+                motif_occurrences["CG"].extend(motif_positions)
                rows.append(row)
-    rows_CpG_df = pd.DataFrame(rows)
-    rows_CpG = rows_CpG_df.sort_values("position")
-    if not rows_CpG.empty:
-        for chromosome in rows_CpG['chromosome'].unique():
-            positions_par_chromosome[chromosome] = list(rows_CpG.loc[rows_CpG['chromosome'] == chromosome, 'position'])
+
+    rows_CG_df = pd.DataFrame(rows)
+    rows_CG = rows_CG_df.sort_values("chromosome")
+    print(rows_CG)
+    output_file = os.path.join(output_dir, filebasename + "_CG.csv")
+    rows_CG.to_csv(output_file, index=False, sep='\t')
+    if not rows_CG.empty:
+        for chromosome in rows_CG['chromosome'].unique():
+            positions_par_chromosome[chromosome] = list(rows_CG.loc[rows_CG['chromosome'] == chromosome, ['strand', 'position']])
    total_count = sum(motif_counts.values())
    pourcentage_CpG = (total_count / nb_CpG) * 100
    pourcentage_C = (total_count / nb_C) * 100
@@ -72,6 +80,8 @@ def CHG_context():
    global CHG
    global nb_C
    global nb_CpG
+    global output_dir
+    global filebasename
    motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
    motif_sum = {} #
    motif_occurrences = {}  # create a dictionary to store occurrences of each motif
@@ -82,19 +92,22 @@ def CHG_context():
        motif_sum[motif] = 0
        motif_occurrences[motif] = []  # initialize an empty list for each motif
    rows = []
-    for i, kmer in enumerate(treated_df["k_mer"]):
+    for i, row in treated_df.iterrows():
+        kmer= row['k_mer']
        for motif in CHG:
            motif_count = kmer.count(motif)
            motif_counts[motif] += motif_count
            if motif_count > 0 and motif == "CCG":
-                motif_occurrences[motif].append(i)
-                row = treated_df.iloc[i]    # Store the raw
+                motif_positions = [(row['chromosome'], row['strand'], pos) for pos in re.finditer(motif, kmer)]  # Store the positions of the motif
+                motif_occurrences[motif].extend(motif_positions)
                rows.append(row)
    rows_CHG_df = pd.DataFrame(rows)
-    rows_CHG = rows_CHG_df.sort_values("position")
+    rows_CHG = rows_CHG_df.sort_values("chromosome")
+    output_file = os.path.join(output_dir, filebasename + "_CCG.csv")
+    rows_CHG.to_csv(output_file, index=False, sep='\t')
    if not rows_CHG.empty:
        for chromosome in rows_CHG['chromosome'].unique():
-            positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, 'position'])
+            positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, ['strand', 'position']])
    total_count = sum(motif_counts.values())
    pourcentage_C_total = (total_count / nb_C) * 100
    print(" --- CHG STATISTICS --- ")
@@ -142,23 +155,8 @@ def CHH_context():
        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
        print()

-def successives_CpG_CCG():
-    global filebasename
-    global output_dir
-    rows_CpG = CpG_context()
-    rows_CHG = CHG_context()
-    results = []
-    for key in set(rows_CpG.keys()) & set(rows_CHG.keys()):
-        positions_CpG = sorted(rows_CpG[key])
-        positions_CHG = sorted(rows_CHG[key])
-        for i in range(len(positions_CpG)):
-            for j in range(len(positions_CHG)):
-                if abs(positions_CpG[i] - positions_CHG[j]) == 1:
-                    results.append({"Chromosome": key, "CCG position": positions_CHG[j], "CpG position": positions_CpG[i]})
-                    #print( " In " + key + " CCG methylation at " + str(positions_CHG[j]) + " and CpG methylation at " + str(positions_CpG[i]) + " are successive ")
-    df = pd.DataFrame(results, columns=["Chromosome", "CCG position", "CpG position"])
-    output_file = os.path.join(output_dir, filebasename + "_successive_CpG_CCG.csv")
-    df.to_csv(output_file, index=False, sep ='\t')
+def hemi_methylated_cytosine():
+    


 print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant  --------####")
@@ -205,12 +203,13 @@ for filename in glob.glob(files): # to select all files present in that path
    methylated=pd.DataFrame(methyl) # save only the methylated reads.

    #methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
-    treated_df = methylated[['chromosome', 'position', 'k_mer']] #filter dataset
+    treated_df = methylated[['chromosome', 'strand', 'position', 'k_mer']] #filter dataset
    treated_df = pd.DataFrame(treated_df)
    treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer

-    #CpG_context()
-    #CHG_context()
-    successives_CpG_CCG()
-    #CHH_context()
+
+    CpG_context()
+    CHG_context()
+    #successives_CpG_CCG()
+    CHH_context()