Skip to content
Snippets Groups Projects
Commit 377f1657 authored by fadwael.khaddar_ird.fr's avatar fadwael.khaddar_ird.fr
Browse files

Modifications

parent 4d03aeb6
No related branches found
No related tags found
No related merge requests found
......@@ -9,6 +9,7 @@ import glob
import sys
import matplotlib as plt
import argparse
import re
# Arguments
......@@ -40,19 +41,26 @@ def CpG_context():
# Loop through k_mer column of treated dataset
for i, first_two_letters in zip(range(len(treated_df["k_mer"])), treated_df["k_mer"].str.slice(0, 2)):
for motif in CpG:
motif_count = first_two_letters.count(motif)
motif_counts[motif] += motif_count
for i, row in treated_df.iterrows():
kmer = row['k_mer']
first_two_letters = kmer[:2]
if "CG" in first_two_letters:
motif_count = first_two_letters.count("CG")
motif_counts["CG"] += motif_count
if motif_count > 0:
motif_occurrences[motif].append(i)
row = treated_df.iloc[i]
motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
re.finditer("CG", first_two_letters)] # Store the positions of the motif
motif_occurrences["CG"].extend(motif_positions)
rows.append(row)
rows_CpG_df = pd.DataFrame(rows)
rows_CpG = rows_CpG_df.sort_values("position")
if not rows_CpG.empty:
for chromosome in rows_CpG['chromosome'].unique():
positions_par_chromosome[chromosome] = list(rows_CpG.loc[rows_CpG['chromosome'] == chromosome, 'position'])
rows_CG_df = pd.DataFrame(rows)
rows_CG = rows_CG_df.sort_values("chromosome")
print(rows_CG)
output_file = os.path.join(output_dir, filebasename + "_CG.csv")
rows_CG.to_csv(output_file, index=False, sep='\t')
if not rows_CG.empty:
for chromosome in rows_CG['chromosome'].unique():
positions_par_chromosome[chromosome] = list(rows_CG.loc[rows_CG['chromosome'] == chromosome, ['strand', 'position']])
total_count = sum(motif_counts.values())
pourcentage_CpG = (total_count / nb_CpG) * 100
pourcentage_C = (total_count / nb_C) * 100
......@@ -72,6 +80,8 @@ def CHG_context():
global CHG
global nb_C
global nb_CpG
global output_dir
global filebasename
motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
motif_sum = {} #
motif_occurrences = {} # create a dictionary to store occurrences of each motif
......@@ -82,19 +92,22 @@ def CHG_context():
motif_sum[motif] = 0
motif_occurrences[motif] = [] # initialize an empty list for each motif
rows = []
for i, kmer in enumerate(treated_df["k_mer"]):
for i, row in treated_df.iterrows():
kmer= row['k_mer']
for motif in CHG:
motif_count = kmer.count(motif)
motif_counts[motif] += motif_count
if motif_count > 0 and motif == "CCG":
motif_occurrences[motif].append(i)
row = treated_df.iloc[i] # Store the raw
motif_positions = [(row['chromosome'], row['strand'], pos) for pos in re.finditer(motif, kmer)] # Store the positions of the motif
motif_occurrences[motif].extend(motif_positions)
rows.append(row)
rows_CHG_df = pd.DataFrame(rows)
rows_CHG = rows_CHG_df.sort_values("position")
rows_CHG = rows_CHG_df.sort_values("chromosome")
output_file = os.path.join(output_dir, filebasename + "_CCG.csv")
rows_CHG.to_csv(output_file, index=False, sep='\t')
if not rows_CHG.empty:
for chromosome in rows_CHG['chromosome'].unique():
positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, 'position'])
positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, ['strand', 'position']])
total_count = sum(motif_counts.values())
pourcentage_C_total = (total_count / nb_C) * 100
print(" --- CHG STATISTICS --- ")
......@@ -142,23 +155,8 @@ def CHH_context():
print(f" The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
print()
def successives_CpG_CCG():
global filebasename
global output_dir
rows_CpG = CpG_context()
rows_CHG = CHG_context()
results = []
for key in set(rows_CpG.keys()) & set(rows_CHG.keys()):
positions_CpG = sorted(rows_CpG[key])
positions_CHG = sorted(rows_CHG[key])
for i in range(len(positions_CpG)):
for j in range(len(positions_CHG)):
if abs(positions_CpG[i] - positions_CHG[j]) == 1:
results.append({"Chromosome": key, "CCG position": positions_CHG[j], "CpG position": positions_CpG[i]})
#print( " In " + key + " CCG methylation at " + str(positions_CHG[j]) + " and CpG methylation at " + str(positions_CpG[i]) + " are successive ")
df = pd.DataFrame(results, columns=["Chromosome", "CCG position", "CpG position"])
output_file = os.path.join(output_dir, filebasename + "_successive_CpG_CCG.csv")
df.to_csv(output_file, index=False, sep ='\t')
def hemi_methylated_cytosine():
print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####")
......@@ -205,12 +203,13 @@ for filename in glob.glob(files): # to select all files present in that path
methylated=pd.DataFrame(methyl) # save only the methylated reads.
#methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
treated_df = methylated[['chromosome', 'position', 'k_mer']] #filter dataset
treated_df = methylated[['chromosome', 'strand', 'position', 'k_mer']] #filter dataset
treated_df = pd.DataFrame(treated_df)
treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer
#CpG_context()
#CHG_context()
successives_CpG_CCG()
#CHH_context()
CpG_context()
CHG_context()
#successives_CpG_CCG()
CHH_context()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment