Skip to content
Snippets Groups Projects
Commit 4d03aeb6 authored by fadwael.khaddar_ird.fr's avatar fadwael.khaddar_ird.fr
Browse files

Automatisation de script

parent cfa17e14
No related branches found
No related tags found
No related merge requests found
####-------- CALCUL DES STATISTIQUES de méthylation à partir de fichier TSV généré par DeepSignal-Plant --------####
####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
# Autor: Fadwa EL KHADDAR
# Lab : DIADE - IRD
# University : Montpellier - France
import os
import pandas as pd
import glob
import sys
import matplotlib as plt
import argparse
# Arguments
parser = argparse.ArgumentParser(description=" Script to generate methylation statistics ")
parser.add_argument('-i', type=str, help='Path to the directory containing the .tsv files ')
parser.add_argument('-c', type=int, help="Number of cytosines present in the genome" )
parser.add_argument('-cpg', type=int, help='Number of CpG present in the genome')
parser.add_argument('-t', type=int, help='Threshold of mapped methylated reads ')
parser.add_argument('-o', type=str, help='Output directory ')
args = parser.parse_args()
# Function definition
def CpG_context():
global CpG
global nb_C
global nb_CpG
positions_CpG = []
motif_counts = {} # Dictionnary to store count of each occurence
motif_counts = {}
motif_sum = {} # Dictionnary to store count all occurences
motif_occurrences = {} # Dictionnary to store count of each occurence
positions_par_chromosome = {}
for motif in CpG:
motif_counts[motif] = 0
motif_sum[motif] = 0
motif_occurrences[motif] = []
rows= []
# Loop through k_mer column of methylated dataset
# Loop through k_mer column of treated dataset
for kmer, position in zip(methylated["k_mer"][2:], methylated["position"]):
sub_kmer = kmer[2:]
sub_kmer_str = "".join(sub_kmer)
# Comptage des occurences de chaque motif dans sub_kmer
for i, first_two_letters in zip(range(len(treated_df["k_mer"])), treated_df["k_mer"].str.slice(0, 2)):
for motif in CpG:
motif_count = sub_kmer_str.count(motif)
motif_count = first_two_letters.count(motif)
motif_counts[motif] += motif_count
if motif_count > 0:
positions_CpG.append(position)
total_count = sum(motif_counts.values())
pourcentage_CpG = (total_count / nb_CpG) * 100
pourcentage_C = (total_count / nb_C) * 100
print(" --- Statistiques de CpG --- ")
motif_occurrences[motif].append(i)
row = treated_df.iloc[i]
rows.append(row)
rows_CpG_df = pd.DataFrame(rows)
rows_CpG = rows_CpG_df.sort_values("position")
if not rows_CpG.empty:
for chromosome in rows_CpG['chromosome'].unique():
positions_par_chromosome[chromosome] = list(rows_CpG.loc[rows_CpG['chromosome'] == chromosome, 'position'])
total_count = sum(motif_counts.values())
pourcentage_CpG = (total_count / nb_CpG) * 100
pourcentage_C = (total_count / nb_C) * 100
print(" --- CpG STATISTICS --- ")
print()
print(f" * Le nombre d'occurence de la méthylation CpG : {total_count} ")
print(f" * The number of occurrences of CpG methylation : {total_count} ")
print(
f" * Le pourcentage de la méthylation CpG parmi tous les CpG du génome de référence : {pourcentage_CpG : .2f} %")
f" * The percentage of CpG methylation among all CpGs in the genome : {pourcentage_CpG : .2f} %")
print(
f" * Le pourcentage des cytosines méthylés (type CpG) parmi tous les cytosines du génome de référence : {pourcentage_C : .2f} %")
f" * The percentage of methylated cytosines (CpG type) among all cytosines in the genome : {pourcentage_C : .2f} %")
print()
return positions_CpG
return positions_par_chromosome
def CHG_context():
global CHG
global nb_C
global nb_CpG
motif_counts = {}
motif_sum = {}
motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
motif_sum = {} #
motif_occurrences = {} # create a dictionary to store occurrences of each motif
positions_par_chromosome = {}
for motif in CHG:
motif_counts[motif] = 0
motif_sum[motif] = 0
......@@ -61,24 +88,27 @@ def CHG_context():
motif_counts[motif] += motif_count
if motif_count > 0 and motif == "CCG":
motif_occurrences[motif].append(i)
row = treated_df.iloc[i]
row = treated_df.iloc[i] # Store the raw
rows.append(row)
if len(rows) != 0:
rows_df = pd.DataFrame(rows)
rows_CHG_df = pd.DataFrame(rows)
rows_CHG = rows_CHG_df.sort_values("position")
if not rows_CHG.empty:
for chromosome in rows_CHG['chromosome'].unique():
positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, 'position'])
total_count = sum(motif_counts.values())
pourcentage_C_total = (total_count / nb_C) * 100
print(" --- Statistiques de CHG --- ")
print(" --- CHG STATISTICS --- ")
print()
print(
f"Le pourcentage des cytosines méthylés (type CHG) parmi tous les cytosines du génome de référence {pourcentage_C_total: .2f} %")
f"The percentage of methylated cytosines (CHG type) among all cytosines in the genome: {pourcentage_C_total: .2f} %")
print()
print(f"Le nombre d'occurence de la méthylation CHG : {total_count}, dont : ")
print(f" The number of occurrences of CHG methylation : {total_count}, which includes : ")
for motif in CHG:
pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
print(f" -> {motif} apparaît {motif_counts[motif]} fois")
print(f" Le pourcentage des cytosines méthylés (type {motif}) est {pourcentage_C_motif: .2f}%")
print(f" -> {motif} apprears {motif_counts[motif]} times")
print(f" The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
print()
return rows_df
return positions_par_chromosome
def CHH_context():
......@@ -90,10 +120,10 @@ def CHH_context():
motif_counts[motif] = 0
motif_sum[motif] = 0
# Boucler à travers chaque k_mer dans methylated dataset
# Loop through k_mer column of treated dataset
for kmer in methylated["k_mer"][2:]:
sub_kmer = kmer[2:]
for kmer in treated_df["k_mer"]:
sub_kmer = kmer[0:]
sub_kmer_str = "".join(sub_kmer)
for motif in CHH:
motif_count = sub_kmer_str.count(motif)
......@@ -101,82 +131,86 @@ def CHH_context():
motif_sum[motif] += motif_count
total_count = sum(motif_counts.values())
pourcentage_C_total = (total_count / nb_C) * 100
print(" --- Statistiques de CHH --- ")
print(" --- CHH STATISTICS --- ")
print()
print(f"Le pourcentage des cytosines méthylés (type CHH) parmi tous les cytosines du génome de référence {pourcentage_C_total: .2f} %")
print(f"The percentage of methylated cytosines (CHH type) among all cytosines in the genome: {pourcentage_C_total: .2f} %")
print()
print(f"Le nombre d'occurence de la méthylation CHH : {total_count}, dont : ")
print(f"Le nombre d'occurence de la méthylation CHH : {total_count}, which includes : ")
for motif in CHH :
pourcentage_C_motif = (motif_sum[motif] / nb_C) * 100
print(f" -> {motif} apparaît {motif_counts[motif]} fois")
print(f" Le pourcentage des cytosines méthylés (type {motif}) est {pourcentage_C_motif: .2f}%")
print(f" -> {motif} appears {motif_counts[motif]} times")
print(f" The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
print()
def successives_CpG_CCG():
global filebasename
global output_dir
rows_CpG = CpG_context()
rows_CHG = CHG_context()
results = []
for key in set(rows_CpG.keys()) & set(rows_CHG.keys()):
positions_CpG = sorted(rows_CpG[key])
positions_CHG = sorted(rows_CHG[key])
for i in range(len(positions_CpG)):
for j in range(len(positions_CHG)):
if abs(positions_CpG[i] - positions_CHG[j]) == 1:
results.append({"Chromosome": key, "CCG position": positions_CHG[j], "CpG position": positions_CpG[i]})
#print( " In " + key + " CCG methylation at " + str(positions_CHG[j]) + " and CpG methylation at " + str(positions_CpG[i]) + " are successive ")
df = pd.DataFrame(results, columns=["Chromosome", "CCG position", "CpG position"])
output_file = os.path.join(output_dir, filebasename + "_successive_CpG_CCG.csv")
df.to_csv(output_file, index=False, sep ='\t')
print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####")
# list of the three methylation contexts
def CpG_CCG_successive():
positions_CHG = CHG_context(methylated)
positions_CpG = CpG_context(methylated)
successive_values = []
for i in range(len(positions_CHG)):
if i < len(positions_CpG):
if abs(positions_CHG[i] - positions_CpG[i]) == 1:
successive_values.append((positions_CHG[i], positions_CpG[i]))
if i > 0:
if abs(positions_CHG[i] - positions_CHG[i - 1]) == 1:
successive_values.append((positions_CHG[i - 1], positions_CHG[i]))
if abs(positions_CpG[i - 1] - positions_CpG[i]) == 1:
successive_values.append((positions_CpG[i - 1], positions_CpG[i]))
successive_values = sorted(successive_values, key=lambda x: x[0])
for x, y in successive_values:
print("Successive positions: CCG = {}, CpG = {}".format(x, y))
print("####-------- CALCUL DES STATISTIQUES DE METHYLATION A PARTIR DE FICHIER TSV GENERE PAR DeepSignal-Plant --------####")
print()
# Utilisation de glob pour pouvoir tourner le script sur tous les fichiers tsv.
path = r"/home/fadwa/Téléchargements/test"
filenames = glob.glob(path + '/*.tsv')
# Déclaration des trois contexte de méthylation
CpG = ["CG"]
CHG = ["CCG", "CAG", "CTG"]
CHH = ["CCC", "CAA", "CTT", "CCA", "CCT", "CAT", "CAC", "CTA", "CTC"]
seuil = 5
nb_C = 21551308
nb_CpG= 5604558
path = args.i
nb_C = args.c
nb_CpG = args.cpg
seuil = args.t
output_dir = args.o
# Lecture de fichier
# read files
for file in filenames:
print(f"Statistiques pour le fichier {file} ")
names = ["Chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"] #Ajout des entêtes des colonnes
call_mods = pd.read_csv(file, sep='\t', names=names)
files = path + "/*.tsv" # store the path to files
for filename in glob.glob(files): # to select all files present in that path
filebasename = os.path.splitext(os.path.basename(filename))[0] # store the name files
print()
print(f" Statistics for the file : {filebasename} ")
print()
names = ["chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"] #Ajout des entêtes des colonnes
call_mods = pd.read_csv(filename, sep='\t', names=names) # read files
# Création du DataFrame
# DataFrame Creation
df = pd.DataFrame(call_mods)
# Nombre des reads méthylés :
# mean coverage :
couverture= df["coverage"].mean()
couverture_int =int(couverture)
print(f"La couverture moyenne est : {couverture_int}")
print(f" The mean coverage : {couverture_int}")
print()
# Extraction des reads méthylés en fonction avec un seuil sup ou égal à 5 reads mappés.
print(" %%% -- Les statistiques suivantes sont effectuées sur un dataset \n dont le nombre de reads dans lesquels la base ciblée est considérée \n comme modifiée est supérieur ou égal à 5 -- %%%")
# filtring dataset
print(f" %%% -- The following statistics are performed on a dataset whose number of reads \n in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")
print()
methyl= df[df["count_modified"] >= seuil]
methylated=pd.DataFrame(methyl) #Création du dataframe methylated contenant
methylated=pd.DataFrame(methyl) # save only the methylated reads.
#methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
treated_df = methylated[['position', 'k_mer']]
treated_df = methylated[['chromosome', 'position', 'k_mer']] #filter dataset
treated_df = pd.DataFrame(treated_df)
treated_df['k_mer'] = treated_df['k_mer'].str[2:]
treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer
#CpG_context()
#CHG_context()
successives_CpG_CCG()
#CHH_context()
CpG_context()
CHG_context()
CHH_context()
#CpG_CCG_successive()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment