Automatisation de script

4d03aeb6 · fadwael.khaddar_ird.fr · cfa17e14 · 4d03aeb6
Commit 4d03aeb6 authored 1 year ago by fadwael.khaddar_ird.fr
--- a/Statistics_methylation.py
+++ b/Statistics_methylation.py
-####-------- CALCUL DES STATISTIQUES de méthylation à partir de fichier TSV généré par DeepSignal-Plant --------####
+####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
 # Autor: Fadwa EL KHADDAR
 # Lab : DIADE - IRD
 # University : Montpellier - France

+import os
 import pandas as pd
 import  glob
 import sys
 import matplotlib as plt
+import argparse
+
+# Arguments
+
+parser = argparse.ArgumentParser(description=" Script to generate methylation statistics ")
+parser.add_argument('-i', type=str, help='Path to the directory containing the .tsv files ')
+parser.add_argument('-c', type=int, help="Number of cytosines present in the genome" )
+parser.add_argument('-cpg', type=int, help='Number of CpG present in the genome')
+parser.add_argument('-t', type=int, help='Threshold of mapped methylated reads ')
+parser.add_argument('-o', type=str, help='Output directory ')
+
+args = parser.parse_args()
+
+
 # Function definition

 def CpG_context():
    global CpG
    global nb_C
    global nb_CpG
-    positions_CpG = []
-    motif_counts = {}  # Dictionnary to store count of each occurence
+    motif_counts = {}
+    motif_sum = {}          # Dictionnary to store count all occurences
+    motif_occurrences = {}  # Dictionnary to store count of each occurence
+    positions_par_chromosome = {}
    for motif in CpG:
        motif_counts[motif] = 0
+        motif_sum[motif] = 0
+        motif_occurrences[motif] = []
+    rows= []

-    # Loop through k_mer column of methylated dataset
+# Loop through k_mer column of treated dataset

-    for kmer, position in zip(methylated["k_mer"][2:], methylated["position"]):
-        sub_kmer = kmer[2:]
-        sub_kmer_str = "".join(sub_kmer)
-
-        # Comptage des occurences de chaque motif dans sub_kmer
+    for i, first_two_letters in zip(range(len(treated_df["k_mer"])), treated_df["k_mer"].str.slice(0, 2)):
        for motif in CpG:
-            motif_count = sub_kmer_str.count(motif)
+            motif_count = first_two_letters.count(motif)
            motif_counts[motif] += motif_count
            if motif_count > 0:
-                positions_CpG.append(position)
-            total_count = sum(motif_counts.values())
-            pourcentage_CpG = (total_count / nb_CpG) * 100
-            pourcentage_C = (total_count / nb_C) * 100
-    print(" --- Statistiques de CpG --- ")
+                motif_occurrences[motif].append(i)
+                row = treated_df.iloc[i]
+                rows.append(row)
+    rows_CpG_df = pd.DataFrame(rows)
+    rows_CpG = rows_CpG_df.sort_values("position")
+    if not rows_CpG.empty:
+        for chromosome in rows_CpG['chromosome'].unique():
+            positions_par_chromosome[chromosome] = list(rows_CpG.loc[rows_CpG['chromosome'] == chromosome, 'position'])
+    total_count = sum(motif_counts.values())
+    pourcentage_CpG = (total_count / nb_CpG) * 100
+    pourcentage_C = (total_count / nb_C) * 100
+
+
+    print(" --- CpG STATISTICS --- ")
    print()
-    print(f" * Le nombre d'occurence de la méthylation CpG : {total_count} ")
+    print(f" * The number of occurrences of CpG methylation : {total_count} ")
    print(
-        f" * Le pourcentage de la méthylation CpG parmi tous les CpG du génome de référence : {pourcentage_CpG : .2f} %")
+        f" * The percentage of CpG methylation among all CpGs in the genome : {pourcentage_CpG : .2f} %")
    print(
-        f" * Le pourcentage des cytosines méthylés (type CpG) parmi tous les cytosines du génome de référence : {pourcentage_C : .2f} %")
+        f" * The percentage of methylated cytosines (CpG type) among all cytosines in the genome : {pourcentage_C : .2f} %")
    print()
-    return positions_CpG
+    return positions_par_chromosome

 def CHG_context():
    global CHG
    global nb_C
    global nb_CpG
-    motif_counts = {}
-    motif_sum = {}
+    motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
+    motif_sum = {} #
    motif_occurrences = {}  # create a dictionary to store occurrences of each motif
+    positions_par_chromosome = {}
+
    for motif in CHG:
        motif_counts[motif] = 0
        motif_sum[motif] = 0
@@ -61,24 +88,27 @@ def CHG_context():
            motif_counts[motif] += motif_count
            if motif_count > 0 and motif == "CCG":
                motif_occurrences[motif].append(i)
-                row = treated_df.iloc[i]
+                row = treated_df.iloc[i]    # Store the raw
                rows.append(row)
-    if len(rows) != 0:
-        rows_df = pd.DataFrame(rows)
+    rows_CHG_df = pd.DataFrame(rows)
+    rows_CHG = rows_CHG_df.sort_values("position")
+    if not rows_CHG.empty:
+        for chromosome in rows_CHG['chromosome'].unique():
+            positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, 'position'])
    total_count = sum(motif_counts.values())
    pourcentage_C_total = (total_count / nb_C) * 100
-    print(" --- Statistiques de CHG --- ")
+    print(" --- CHG STATISTICS --- ")
    print()
    print(
-        f"Le pourcentage des cytosines méthylés (type CHG) parmi tous les cytosines du génome de référence {pourcentage_C_total: .2f} %")
+        f"The percentage of methylated cytosines (CHG type) among all cytosines in the genome:  {pourcentage_C_total: .2f} %")
    print()
-    print(f"Le nombre d'occurence de la méthylation CHG : {total_count}, dont : ")
+    print(f" The number of occurrences of CHG methylation : {total_count}, which includes : ")
    for motif in CHG:
        pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
-        print(f"    -> {motif} apparaît {motif_counts[motif]} fois")
-        print(f"       Le pourcentage des cytosines méthylés (type {motif}) est {pourcentage_C_motif: .2f}%")
+        print(f"    -> {motif} apprears {motif_counts[motif]} times")
+        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
        print()
-    return rows_df
+    return positions_par_chromosome


 def CHH_context():
@@ -90,10 +120,10 @@ def CHH_context():
        motif_counts[motif] = 0
        motif_sum[motif] = 0

-    # Boucler à travers chaque k_mer dans methylated dataset
+# Loop through k_mer column of treated dataset

-    for kmer in methylated["k_mer"][2:]:
-        sub_kmer = kmer[2:]
+    for kmer in treated_df["k_mer"]:
+        sub_kmer = kmer[0:]
        sub_kmer_str = "".join(sub_kmer)
        for motif in CHH:
            motif_count = sub_kmer_str.count(motif)
@@ -101,82 +131,86 @@ def CHH_context():
            motif_sum[motif] += motif_count
    total_count = sum(motif_counts.values())
    pourcentage_C_total = (total_count / nb_C) * 100
-    print(" --- Statistiques de CHH --- ")
+    print(" --- CHH STATISTICS --- ")
    print()
-    print(f"Le pourcentage des cytosines méthylés (type CHH) parmi tous les cytosines du génome de référence {pourcentage_C_total: .2f} %")
+    print(f"The percentage of methylated cytosines (CHH type) among all cytosines in the genome: {pourcentage_C_total: .2f} %")
    print()
-    print(f"Le nombre d'occurence de la méthylation CHH : {total_count}, dont : ")
+    print(f"Le nombre d'occurence de la méthylation CHH : {total_count}, which includes : ")
    for motif in CHH :
        pourcentage_C_motif = (motif_sum[motif] / nb_C) * 100
-        print(f"    -> {motif} apparaît {motif_counts[motif]} fois")
-        print(f"       Le pourcentage des cytosines méthylés (type {motif}) est {pourcentage_C_motif: .2f}%")
+        print(f"    -> {motif} appears {motif_counts[motif]} times")
+        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
        print()

+def successives_CpG_CCG():
+    global filebasename
+    global output_dir
+    rows_CpG = CpG_context()
+    rows_CHG = CHG_context()
+    results = []
+    for key in set(rows_CpG.keys()) & set(rows_CHG.keys()):
+        positions_CpG = sorted(rows_CpG[key])
+        positions_CHG = sorted(rows_CHG[key])
+        for i in range(len(positions_CpG)):
+            for j in range(len(positions_CHG)):
+                if abs(positions_CpG[i] - positions_CHG[j]) == 1:
+                    results.append({"Chromosome": key, "CCG position": positions_CHG[j], "CpG position": positions_CpG[i]})
+                    #print( " In " + key + " CCG methylation at " + str(positions_CHG[j]) + " and CpG methylation at " + str(positions_CpG[i]) + " are successive ")
+    df = pd.DataFrame(results, columns=["Chromosome", "CCG position", "CpG position"])
+    output_file = os.path.join(output_dir, filebasename + "_successive_CpG_CCG.csv")
+    df.to_csv(output_file, index=False, sep ='\t')
+
+
+print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant  --------####")
+
+# list of the three methylation contexts

-def CpG_CCG_successive():
-    positions_CHG = CHG_context(methylated)
-    positions_CpG = CpG_context(methylated)
-    successive_values = []
-    for i in range(len(positions_CHG)):
-        if i < len(positions_CpG):
-            if abs(positions_CHG[i] - positions_CpG[i]) == 1:
-                successive_values.append((positions_CHG[i], positions_CpG[i]))
-        if i > 0:
-            if abs(positions_CHG[i] - positions_CHG[i - 1]) == 1:
-                successive_values.append((positions_CHG[i - 1], positions_CHG[i]))
-            if abs(positions_CpG[i - 1] - positions_CpG[i]) == 1:
-                successive_values.append((positions_CpG[i - 1], positions_CpG[i]))
-    successive_values = sorted(successive_values, key=lambda x: x[0])
-    for x, y in successive_values:
-        print("Successive positions: CCG = {}, CpG = {}".format(x, y))
-
-
-print("####-------- CALCUL DES STATISTIQUES DE METHYLATION A PARTIR DE FICHIER TSV GENERE PAR DeepSignal-Plant --------####")
-print()
-# Utilisation de glob pour pouvoir tourner le script sur tous les fichiers tsv.
-
-path = r"/home/fadwa/Téléchargements/test"
-filenames = glob.glob(path + '/*.tsv')
-
-
-# Déclaration des trois contexte de méthylation
 CpG = ["CG"]
 CHG = ["CCG", "CAG", "CTG"]
 CHH = ["CCC", "CAA", "CTT", "CCA", "CCT", "CAT", "CAC", "CTA", "CTC"]
-seuil = 5
-nb_C = 21551308
-nb_CpG= 5604558

+path = args.i
+nb_C = args.c
+nb_CpG = args.cpg
+seuil = args.t
+output_dir = args.o

-# Lecture de fichier
+# read files

-for file in filenames:
-    print(f"Statistiques pour le fichier {file} ")
-    names = ["Chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"]    #Ajout des entêtes des colonnes
-    call_mods = pd.read_csv(file, sep='\t', names=names)
+files = path + "/*.tsv"      # store the path to files
+for filename in glob.glob(files): # to select all files present in that path
+    filebasename = os.path.splitext(os.path.basename(filename))[0] # store the name files
+    print()
+    print(f" Statistics for the file : {filebasename} ")
+    print()
+    names = ["chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"]    #Ajout des entêtes des colonnes
+    call_mods = pd.read_csv(filename, sep='\t', names=names) # read files

-    # Création du DataFrame
+    # DataFrame Creation

    df = pd.DataFrame(call_mods)

-    # Nombre des reads méthylés :
+    # mean coverage :

    couverture= df["coverage"].mean()
    couverture_int =int(couverture)
-    print(f"La couverture moyenne est : {couverture_int}")
+    print(f" The mean coverage : {couverture_int}")
    print()

-    # Extraction des reads méthylés en fonction avec un seuil sup ou égal à 5 reads mappés.
-    print(" %%% -- Les statistiques suivantes sont effectuées sur un dataset \n dont le nombre de reads dans lesquels la base ciblée est considérée \n comme modifiée est supérieur ou égal à 5 -- %%%")
+    # filtring dataset
+
+    print(f" %%% -- The following statistics are performed on a dataset whose number of reads \n in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")
    print()
    methyl= df[df["count_modified"] >= seuil]
-    methylated=pd.DataFrame(methyl) #Création du dataframe methylated contenant
+    methylated=pd.DataFrame(methyl) # save only the methylated reads.
+
    #methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
-    treated_df = methylated[['position', 'k_mer']]
+    treated_df = methylated[['chromosome', 'position', 'k_mer']] #filter dataset
    treated_df = pd.DataFrame(treated_df)
-    treated_df['k_mer'] = treated_df['k_mer'].str[2:]
+    treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer
+
+    #CpG_context()
+    #CHG_context()
+    successives_CpG_CCG()
+    #CHH_context()

-    CpG_context()
-    CHG_context()
-    CHH_context()
-    #CpG_CCG_successive()