Skip to content
Snippets Groups Projects
Commit 5f53a3ce authored by fadwael.khaddar_ird.fr's avatar fadwael.khaddar_ird.fr
Browse files

Update Statistics_Calculation_DeepSignalPlant

parent 4c12ef1d
No related branches found
No related tags found
No related merge requests found
###-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
# Autor: Fadwa EL KHADDAR
# Lab : DIADE - IRD
# University : Montpellier - France
import os
import subprocess
import numpy as np
import sys
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import argparse
import re
from tkinter import *
import tkinter as tk
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
matplotlib.use('TkAgg')
# Arguments
parser = argparse.ArgumentParser(description=" Script to generate methylation statistics ")
parser.add_argument('-fasta', type=str, help="Path to fasta file" )
parser.add_argument('-i', type=str, help='Path to the directory containing the .tsv files ')
parser.add_argument('-t', type=int, help='Threshold of mapped methylated reads ')
parser.add_argument('-o', type=str, help='Output directory ')
args = parser.parse_args()
# Function definition
def CpG_context():
global CpG
global nb_C
global nb_CpG
motif_counts = {}
motif_sum = {} # Dictionnary to store count all occurences
motif_occurrences = {} # Dictionnary to store count of each occurence
positions_par_chromosome = {}
for motif in CpG:
motif_counts[motif] = 0
motif_sum[motif] = 0
motif_occurrences[motif] = []
rows= []
# Loop through k_mer column of treated dataset
for i, row in treated_df.iterrows():
kmer = row['k_mer']
first_two_letters = kmer[:2]
if "CG" in first_two_letters:
motif_count = first_two_letters.count("CG")
motif_counts["CG"] += motif_count
if motif_count > 0:
motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
re.finditer("CG", first_two_letters)] # Store the positions of the motif
motif_occurrences["CG"].extend(motif_positions)
rows.append(row)
rows_CG_df = pd.DataFrame(rows)
rows_CG = rows_CG_df.sort_values("chromosome")
output_file = os.path.join(output_dir, filebasename + "_CG.csv")
rows_CG.to_csv(output_file, index=False, sep='\t')
#if not rows_CG.empty:
# for chromosome in rows_CG['chromosome'].unique():
# positions_par_chromosome[chromosome] = list(rows_CG.loc[rows_CG['chromosome'] == chromosome, ['strand', 'position']])
total_count_CpG = sum(motif_counts.values())
pourcentage_CpG = (total_count_CpG / nb_CpG) * 100
pourcentage_C = (total_count_CpG / nb_C) * 100
print(" --- CpG STATISTICS --- ")
print()
print(f" * The number of occurrences of CpG methylation : {total_count_CpG} ")
print(
f" * The percentage of CpG methylation among all CpGs in the genome : {pourcentage_CpG : .2f} %")
print(
f" * The percentage of methylated cytosines (CpG type) among all cytosines in the genome : {pourcentage_C : .2f} %")
print()
return total_count_CpG
def CHG_context():
global CHG
global nb_C
global nb_CpG
global output_dir
global filebasename
motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
motif_sum = {} #
motif_occurrences = {} # create a dictionary to store occurrences of each motif
positions_par_chromosome = {}
for motif in CHG:
motif_counts[motif] = 0
motif_sum[motif] = 0
motif_occurrences[motif] = [] # initialize an empty list for each motif
rows = []
for i, row in treated_df.iterrows():
kmer= row['k_mer']
for motif in CHG:
motif_count = kmer.count(motif)
motif_counts[motif] += motif_count
if motif_count > 0:
motif_positions = [(row['chromosome'], row['strand'], pos) for pos in re.finditer(motif, kmer)] # Store the positions of the motif
motif_occurrences[motif].extend(motif_positions)
rows.append(row)
rows_CHG_df = pd.DataFrame(rows)
rows_CHG = rows_CHG_df.sort_values("chromosome")
#output_file = os.path.join(output_dir, filebasename + "_CHG.csv")
#rows_CHG.to_csv(output_file, index=False, sep='\t')
#if not rows_CHG.empty:
# for chromosome in rows_CHG['chromosome'].unique():
# positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, ['strand', 'position']])
total_count_CHG = sum(motif_counts.values())
pourcentage_C_total = (total_count_CHG / nb_C) * 100
print(" --- CHG STATISTICS --- ")
print()
print(
f"The percentage of methylated cytosines (CHG type) among all cytosines in the genome: {pourcentage_C_total: .2f} %")
print()
print(f" The number of occurrences of CHG methylation : {total_count_CHG}, which includes : ")
for motif in CHG:
pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
print(f" -> {motif} apprears {motif_counts[motif]} times")
print(f" The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
print()
return total_count_CHG
def CHH_context():
global CHH
global nb_C
global nb_CpG
global output_dir
global filebasename
motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
motif_sum = {} #
motif_occurrences = {} # create a dictionary to store occurrences of each motif
positions_par_chromosome = {}
for motif in CHH:
motif_counts[motif] = 0
motif_sum[motif] = 0
motif_occurrences[motif] = [] # initialize an empty list for each motif
rows = []
for i, row in treated_df.iterrows():
kmer = row['k_mer']
for motif in CHH:
motif_count = kmer.count(motif)
motif_counts[motif] += motif_count
if motif_count > 0:
motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
re.finditer(motif, kmer)] # Store the positions of the motif
motif_occurrences[motif].extend(motif_positions)
rows.append(row)
rows_CHH_df = pd.DataFrame(rows)
rows_CHH = rows_CHH_df.sort_values("chromosome")
#output_file = os.path.join(output_dir, filebasename + "_CHH.csv")
#rows_CHH.to_csv(output_file, index=False, sep='\t')
#if not rows_CHH.empty:
# for chromosome in rows_CHH['chromosome'].unique():
# positions_par_chromosome[chromosome] = list(
# rows_CHH.loc[rows_CHH['chromosome'] == chromosome, ['strand', 'position']])
total_count_CHH = sum(motif_counts.values())
pourcentage_C_total = (total_count_CHH / nb_C) * 100
print(" --- CHH STATISTICS --- ")
print()
print(
f"The percentage of methylated cytosines (CHH type) among all cytosines in the genome: {pourcentage_C_total: .2f} %")
print()
print(f" The number of occurrences of CHH methylation : {total_count_CHH}, which includes : ")
for motif in CHH:
pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
print(f" -> {motif} apprears {motif_counts[motif]} times")
print(f" The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
print()
return total_count_CHH
def plotting():
global nb_C
CpG = CpG_context()
CHG = CHG_context()
CHH = CHH_context()
fig, ax = plt.subplots(figsize=(10,6), subplot_kw=dict(aspect="equal"))
nb_unmethylated = nb_C - CpG - CHG - CHH
counts = [CpG, CHG, CHH, nb_unmethylated]
labels = ["CpG", "CHG", "CHH", "unmethylated"]
wedges, texts, autotexts = ax.pie(counts, autopct='%1.1f%%')
bbox_props= dict(boxstyle="square, pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center")
for i, p in enumerate(wedges):
ang = (p.theta2 - p.theta1) / 2. + p.theta1
y = np.sin(np.deg2rad(ang))
x = np.cos(np.deg2rad(ang))
horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
connectionstyle = "angle,angleA=0,angleB={}".format(ang)
kw["arrowprops"].update({"connectionstyle": connectionstyle})
ax.annotate(f"{labels[i]}: {counts[i]}", xy=(x, y), xytext=(1.35 * np.sign(x), 1.4 * y),
horizontalalignment=horizontalalignment, **kw)
plt.title(f"Methylation context amoung all cytosines in the sample : {filebasename}")
root = tk.Tk()
root.title("Methylation Statistics")
# Create a canvas to display the figure
canvas = FigureCanvasTkAgg(fig, master=root)
canvas.draw()
canvas.get_tk_widget().pack()
# Create a button to close the window
button = tk.Button(master=root, text="Fermer", command=root.quit)
button.pack()
# Start the tkinter event loop
tk.mainloop()
print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####")
# list of the three methylation contexts
CpG = ["CG"]
CHG = ["CCG", "CAG", "CTG"]
CHH = ["CCC", "CAA", "CTT", "CCA", "CCT", "CAT", "CAC", "CTA", "CTC"]
path = args.i
fasta = args.fasta
#nb_CpG = args.cpg
seuil = args.t
output_dir = args.o
# Running seqtk
result = subprocess.run(f"seqtk comp {fasta} | awk '{{C+=$4}}END{{print C}}'", shell=True, capture_output=True, text=True)
nb_C = int(result.stdout.strip())
print(f"Le nombre total de cytosine : {nb_C}")
result = subprocess.run(f"seqtk comp {fasta} | awk '{{CpG+=$10}}END{{print CpG}}'", shell=True, capture_output=True, text=True)
nb_CpG = int(result.stdout.strip())
print(f"Le nombre total de CpG : {nb_CpG}")
# read files
files = path + "/*.tsv" # store the path to files
for filename in glob.glob(files): # to select all files present in that path
filebasename = os.path.splitext(os.path.basename(filename))[0] # store the name files
print()
print(f" Statistics for the file : {filebasename} ")
print()
names = ["chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"] #Ajout des entêtes des colonnes
call_mods = pd.read_csv(filename, sep='\t', names=names) # read files
# DataFrame Creation
df = pd.DataFrame(call_mods)
# mean coverage :
couverture= df["coverage"].mean()
couverture_int =int(couverture)
print(f" The mean coverage : {couverture_int}")
print()
# filtring dataset
print(f" %%% -- The following statistics are performed on a dataset whose number of reads \n in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")
print()
methyl= df[df["count_modified"] >= seuil]
methylated=pd.DataFrame(methyl) # save only the methylated reads.
#methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
treated_df = methylated[['chromosome', 'strand', 'position', 'k_mer']] #filter dataset
treated_df = pd.DataFrame(treated_df)
treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer
#CpG_context()
#CHG_context()
#CHH_context()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment