Skip to content
Snippets Groups Projects
Commit 24983914 authored by fadwael.khaddar_ird.fr's avatar fadwael.khaddar_ird.fr
Browse files

Ajout de l'analyse par Chunk

parent 159e786e
No related branches found
No related tags found
No related merge requests found
####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
###-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
# Autor: Fadwa EL KHADDAR
# Lab : DIADE - IRD
# University : Montpellier - France
......@@ -6,8 +6,6 @@
import os
import subprocess
import numpy as np
import pandas as pd
import glob
import sys
import matplotlib
import matplotlib.pyplot as plt
......@@ -16,37 +14,43 @@ import argparse
import re
from tkinter import *
import tkinter as tk
import glob
import pandas as pd
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
matplotlib.use('TkAgg')
# Arguments
parser = argparse.ArgumentParser(description=" Script to generate methylation statistics ")
parser.add_argument('-fasta', type=str, help="Path to fasta file")
parser.add_argument('-fasta', type=str, help="Path to fasta file" )
parser.add_argument('-i', type=str, help='Path to the directory containing the .tsv files ')
parser.add_argument('-t', type=int, help='Threshold of mapped methylated reads ')
parser.add_argument('-o', type=str, help='Output directory ')
args = parser.parse_args()
path = args.i
fasta = args.fasta
seuil = args.t
output_dir = args.o
# Function definition
def CpG_context(df_chromosome):
def CpG_context():
global CpG
global nb_C
global nb_CpG
motif_counts = {}
motif_sum = {} # Dictionnary to store count all occurences
motif_sum = {} # Dictionnary to store count all occurences
motif_occurrences = {} # Dictionnary to store count of each occurence
positions_par_chromosome = {}
for motif in CpG:
motif_counts[motif] = 0
motif_sum[motif] = 0
motif_occurrences[motif] = []
rows = []
rows= []
# Loop through k_mer column of treated dataset
# Loop through k_mer column of treated dataset
for i, row in treated_df.iterrows():
kmer = row['k_mer']
......@@ -59,11 +63,11 @@ def CpG_context(df_chromosome):
re.finditer("CG", first_two_letters)] # Store the positions of the motif
motif_occurrences["CG"].extend(motif_positions)
rows.append(row)
rows_CG_df = pd.DataFrame(rows)
rows_CG = rows_CG_df.sort_values("chromosome")
output_file = os.path.join(output_dir, filebasename + "_CG.csv")
rows_CG.to_csv(output_file, index=False, sep='\t')
# if not rows_CG.empty:
#rows_CG_df = pd.DataFrame(rows)
#rows_CG = rows_CG_df.sort_values("chromosome")
#output_file = os.path.join(output_dir, filebasename + "_CG.csv")
#rows_CG.to_csv(output_file, index=False, sep='\t')
#if not rows_CG.empty:
# for chromosome in rows_CG['chromosome'].unique():
# positions_par_chromosome[chromosome] = list(rows_CG.loc[rows_CG['chromosome'] == chromosome, ['strand', 'position']])
total_count_CpG = sum(motif_counts.values())
......@@ -81,15 +85,14 @@ def CpG_context(df_chromosome):
return total_count_CpG
def CHG_context(df_chromosome):
def CHG_context():
global CHG
global nb_C
global nb_CpG
global output_dir
global filebasename
motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
motif_sum = {} #
motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
motif_sum = {} #
motif_occurrences = {} # create a dictionary to store occurrences of each motif
positions_par_chromosome = {}
......@@ -99,20 +102,19 @@ def CHG_context(df_chromosome):
motif_occurrences[motif] = [] # initialize an empty list for each motif
rows = []
for i, row in treated_df.iterrows():
kmer = row['k_mer']
kmer= row['k_mer']
for motif in CHG:
motif_count = kmer.count(motif)
motif_counts[motif] += motif_count
if motif_count > 0:
motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
re.finditer(motif, kmer)] # Store the positions of the motif
motif_positions = [(row['chromosome'], row['strand'], pos) for pos in re.finditer(motif, kmer)] # Store the positions of the motif
motif_occurrences[motif].extend(motif_positions)
rows.append(row)
rows_CHG_df = pd.DataFrame(rows)
rows_CHG = rows_CHG_df.sort_values("chromosome")
output_file = os.path.join(output_dir, filebasename + "_CHG.csv")
rows_CHG.to_csv(output_file, index=False, sep='\t')
# if not rows_CHG.empty:
#rows_CHG_df = pd.DataFrame(rows)
#rows_CHG = rows_CHG_df.sort_values("chromosome")
#output_file = os.path.join(output_dir, filebasename + "_CHG.csv")
#rows_CHG.to_csv(output_file, index=False, sep='\t')
#if not rows_CHG.empty:
# for chromosome in rows_CHG['chromosome'].unique():
# positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, ['strand', 'position']])
total_count_CHG = sum(motif_counts.values())
......@@ -133,7 +135,7 @@ def CHG_context(df_chromosome):
return total_count_CHG
def CHH_context(df_chromosome):
def CHH_context():
global CHH
global nb_C
global nb_CpG
......@@ -159,11 +161,11 @@ def CHH_context(df_chromosome):
re.finditer(motif, kmer)] # Store the positions of the motif
motif_occurrences[motif].extend(motif_positions)
rows.append(row)
rows_CHH_df = pd.DataFrame(rows)
rows_CHH = rows_CHH_df.sort_values('chromosome')
output_file = os.path.join(output_dir, filebasename + "_CHH.csv")
rows_CHH.to_csv(output_file, index=False, sep='\t')
# if not rows_CHH.empty:
#rows_CHH_df = pd.DataFrame(rows)
#rows_CHH = rows_CHH_df.sort_values("chromosome")
#output_file = os.path.join(output_dir, filebasename + "_CHH.csv")
#rows_CHH.to_csv(output_file, index=False, sep='\t')
#if not rows_CHH.empty:
# for chromosome in rows_CHH['chromosome'].unique():
# positions_par_chromosome[chromosome] = list(
# rows_CHH.loc[rows_CHH['chromosome'] == chromosome, ['strand', 'position']])
......@@ -183,15 +185,21 @@ def CHH_context(df_chromosome):
return total_count_CHH
def plotting(total_count_CpG_list, total_count_CHG_list, total_count_CHH_list):
global nb_C
CpG = sum(total_count_CpG_list)
CHG = sum(total_count_CHG_list)
CHH = sum(total_count_CHH_list)
def plotting(counts, labels):
fig, ax = plt.subplots(figsize=(8, 6), subplot_kw=dict(aspect="equal"))
fig = plt.figure()
ax = fig.add_subplot(111)
wedges, texts, autotexts = ax.pie(counts, autopct='%1.1f%%')
fig, ax = plt.subplots(figsize=(10,6), subplot_kw=dict(aspect="equal"))
nb_unmethylated = nb_C - CpG - CHG - CHH
counts = [CpG, CHG , CHH , nb_unmethylated]
bbox_props = dict(boxstyle="square, pad=0.3", fc="w", ec="k", lw=0.72)
labels = ["CpG", "CHG", "CHH", "unmethylated"]
wedges, texts, autotexts = ax.pie(counts, autopct='%1.1f%%')
bbox_props= dict(boxstyle="square, pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center")
for i, p in enumerate(wedges):
......@@ -205,7 +213,21 @@ def plotting(counts, labels):
horizontalalignment=horizontalalignment, **kw)
plt.title(f"Methylation context amoung all cytosines in the sample : {filebasename}")
plt.show()
root = tk.Tk()
root.title("Methylation Statistics")
# Create a canvas to display the figure
canvas = FigureCanvasTkAgg(fig, master=root)
canvas.draw()
canvas.get_tk_widget().pack()
# Create a button to close the window
button = tk.Button(master=root, text="Fermer", command=root.quit)
button.pack()
# Start the tkinter event loop
tk.mainloop()
print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####")
......@@ -215,66 +237,66 @@ CpG = ["CG"]
CHG = ["CCG", "CAG", "CTG"]
CHH = ["CCC", "CAA", "CTT", "CCA", "CCT", "CAT", "CAC", "CTA", "CTC"]
path = args.i
fasta = args.fasta
# nb_CpG = args.cpg
seuil = args.t
output_dir = args.o
# Running seqtk
result = subprocess.run(f"seqtk comp {fasta} | awk '{{C+=$4}}END{{print C}}'", shell=True, capture_output=True,
text=True)
if result.stdout.strip():
nb_C = int(result.stdout.strip())
print(f"Le nombre total de cytosine : {nb_C}")
else:
print("Erreur : la commande n'a pas retourné de résultat.")
result = subprocess.run(f"seqtk comp {fasta} | awk '{{CpG+=$10}}END{{print CpG}}'", shell=True, capture_output=True,
text=True)
if result.stdout.strip():
nb_CpG = int(result.stdout.strip())
print(f"Le nombre total de CpG : {nb_CpG}")
else:
print("Erreur : la commande n'a pas retourné de résultat.")
result = subprocess.run(f"seqtk comp {fasta} | awk '{{C+=$4}}END{{print C}}'", shell=True, capture_output=True, text=True)
nb_C = int(result.stdout.strip())
print(f"Le nombre total de cytosine : {nb_C}")
# read files
files = path + "/*.tsv" # store the path to files
chromosomes = []
chromosome_results = {'cpg': 0, 'chg': 0, 'chh': 0}
result = subprocess.run(f"seqtk comp {fasta} | awk '{{CpG+=$10}}END{{print CpG}}'", shell=True, capture_output=True, text=True)
nb_CpG = int(result.stdout.strip())
print(f"Le nombre total de CpG : {nb_CpG}")
for filename in glob.glob(files):
filebasename = os.path.splitext(os.path.basename(filename))[0]
print()
print(f" Statistics for the file: {filebasename}")
print()
names = ["chromosome", "position", "strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified",
"count_unmodified", "coverage", "modification_frequency", "k_mer"]
call_mods = pd.read_csv(filename, sep='\t', names=names)
df = pd.DataFrame(call_mods)
# read files
total_count_CpG_list = []
total_count_CHH_list = []
total_count_CHG_list = []
couverture = df["coverage"].mean()
couverture_int = int(couverture)
print(f" The mean coverage: {couverture_int}")
print()
files = path + "/*.tsv" # store the path to files
print(
f" %%% -- The following statistics are performed on a dataset whose number of reads \n in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")
print(f" %%% -- The following statistics are performed on a dataset whose number of reads\n"
f" in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")
for filename in glob.glob(files): # to select all files present in that path
filebasename = os.path.splitext(os.path.basename(filename))[0] # store the name files
print()
methyl = df[df["count_modified"] >= seuil]
treated_df = methyl[['chromosome', 'strand', 'position', 'k_mer']].copy()
treated_df['k_mer'] = treated_df['k_mer'].str[2:]
unique_chromosomes = treated_df['chromosome'].unique()
for chromosome in unique_chromosomes:
df_chromosome = treated_df[treated_df['chromosome'] == chromosome]
result_cpg = CpG_context(df_chromosome)
result_chg = CHG_context(df_chromosome)
result_chh = CHH_context(df_chromosome)
chromosomes.extend(unique_chromosomes)
chromosome_results = {'cpg': result_cpg, 'chg': result_chg, 'chh': result_chh}
counts = [result_cpg, result_chg, result_chh]
labels = ["CpG", "CHG", "CHH"]
plotting(counts, labels)
print(f" Statistics for the file : {filebasename} ")
print()
chunksize= 100000 # You can modify this value to fit your chunksize
names = ["chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"] #Ajout des entêtes des colonnes
call_mods = pd.read_csv(filename, sep='\t', names=names, chunksize=chunksize) # read files
chunk_number = 1
for chunk in call_mods:
# DataFrame Creation
df = pd.DataFrame(chunk)
# mean coverage :
#couverture= df["coverage"].mean()
#couverture_int =int(couverture)
#print(f" The mean coverage : {couverture_int}")
#print()
print()
methyl= df[df["count_modified"] >= seuil]
print(f"Processing chunk {chunk_number}")
methylated=pd.DataFrame(methyl) # save only the methylated reads.
#methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
treated_df = methylated[['chromosome', 'strand', 'position', 'k_mer']] #filter dataset
treated_df = pd.DataFrame(treated_df)
treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer
total_count_CpG_chunk = CpG_context()
total_count_CHG_chunk = CHG_context()
total_count_CHH_chunk = CHH_context()
total_count_CpG_list.append(total_count_CpG_chunk)
total_count_CHG_list.append(total_count_CHG_chunk)
total_count_CHH_list.append(total_count_CHH_chunk)
chunk_number += 1
total_count_CpG_chunk = sum(total_count_CpG_list)
total_count_CHG_chunk = sum(total_count_CHG_list)
total_count_CHH_chunk = sum(total_count_CHH_list)
#CpG_context()
#CHG_context()
#CHH_context()
plotting(total_count_CpG_list, total_count_CHG_list, total_count_CHH_list)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment