From 346756b983352276e56aeef7019f8fd838afec36 Mon Sep 17 00:00:00 2001 From: NMarthe <nina.marthe34@gmail.com> Date: Wed, 17 May 2023 13:39:52 +0200 Subject: [PATCH] corrections et ajouts dans les statistiques de transfert --- ClassSegFeat.py | 6 + Functions.py | 348 ++++++++++++++++++++++++++++++++++-------------- main.py | 12 +- 3 files changed, 265 insertions(+), 101 deletions(-) diff --git a/ClassSegFeat.py b/ClassSegFeat.py index 92ac81f..5486a93 100644 --- a/ClassSegFeat.py +++ b/ClassSegFeat.py @@ -7,6 +7,10 @@ class Segment: self.chr=chr self.start=int(start) self.stop=int(stop) + + # segments précedent et suivant. + # séquence. + # cf odgi objets self.size=self.stop-self.start+1 @@ -30,6 +34,8 @@ class Feature: self.childs=childs # liste of childs features (exons, cds, etc) self.parent=parent self.segments_list=seg # list of oriented segment on which the feature is (-s1/+s1, depending on the strand) + + self.note="" def __str__(self): if self.parent=="": diff --git a/Functions.py b/Functions.py index 43f4236..6e7b434 100644 --- a/Functions.py +++ b/Functions.py @@ -1,4 +1,5 @@ import ClassSegFeat as Class +import subprocess # functions to create the segments and the features @@ -73,6 +74,21 @@ def add_seg(feat,new_seg,strand): Features[feat].segments_list.append(seg_stranded) +def set_note(id): + feat=Features[id] + if feat.type=="gene": + feat.note=feat.annot.split(';')[-1] + else: + curent=feat.parent + ann=0 + while ann==0: + if Features[curent].type=="gene": + fonction=Features[curent].annot.split(';')[-1] + feat.note=fonction + ann=1 + else: + curent=Features[Features[curent].parent].id + # create all the Segment and Feature objects in the dictionnaries Segments and Features def create_seg_feat(intersect_path): @@ -92,6 +108,7 @@ def create_seg_feat(intersect_path): if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list init_feature(line) + #set_function(feature_id) else: # add the current segment to the list of segments that have the existing feature strand=line.split()[10] add_seg(feature_id,segment_id,strand) @@ -103,6 +120,9 @@ def create_seg_feat(intersect_path): strand=line.split()[10] add_feature(segment_id,feature_id,strand) + for feat_id in Features: + set_note(feat_id) + file.close() @@ -253,8 +273,14 @@ def genome_gff(pos_seg, gff, out): gff=open(gff,'r') file_out = open(out, 'w') - + file_out_alt=open("azucena_chr10_alt.gff",'w') + file_out2 = open("segments_manquants.txt", 'w') + lines=gff.readlines() + + diff_list=list() + + stats=True # get the list of all the features to transfer list_feature=list() @@ -264,61 +290,135 @@ def genome_gff(pos_seg, gff, out): list_feature.append(id) # stats on how many segments are absent in azucena, their average length, etc - stats=False if stats==True: seg_first=list() seg_middle=list() seg_last=list() - seg_total=list() + seg_total_abs=list() seg_ok=list() seg_entier=list() + seg_total=list() + feature_missing_first=list() feature_missing_middle=list() feature_missing_last=list() feature_missing_all=list() feature_missing_total=list() + feature_total=list() + feature_ok=list() + + #size_seg_missing=list() + + + + - # for each feature, get list of the segments where it is + # for each feature, get list of the segments where it is and the first and last segment of the feature on the genome for feat in list_feature: list_seg=Features[feat].segments_list size_list=len(list_seg) + + segments_missing=list() + + first_seg="0" + for segment in list_seg: + if segment[1:] in seg_ac: + first_seg=segment[1:] + break + last_seg="0" + for segment in reversed(list_seg): + if segment[1:] in seg_ac: + last_seg=segment[1:] + break + # stats on how many segments are absent in azucena, their average length, etc - if stats==True: + + # for the segments + # manque au début-seg_first + # manque à la fin-seg_last + # manque au milieu-seg_middle + # début milieu et fin, donc feature entier ?-seg_entier + # nb total de segments dans un gène et pas chez azucena-seg_total_abs + # présents chez azucena-seg_ok + # segments total dans une feature (présent ou pas dans azucena)-seg_total + + if (stats==True): + for segment in list_seg: seg_len=Segments[segment[1:]].size if segment[1:] not in seg_ac: seg_total.append(seg_len) - if feat not in feature_missing_total: - feature_missing_total.append(feat) + segments_missing.append(Segments[segment[1:]]) + #if feat not in feature_missing_total: + # feature_missing_total.append(feat) if segment==list_seg[0]: - if segment==list_seg[size_list-1]: - seg_entier.append(seg_len) - if feat not in feature_missing_all: - feature_missing_all.append(feat) + if segment==list_seg[-1]: + seg_entier.append(seg_len) # contient les features en un seul segment, absent chez azu. + # if feat not in feature_missing_all: + # feature_missing_all.append(feat) else: seg_first.append(seg_len) - if feat not in feature_missing_first: - feature_missing_first.append(feat) - elif segment==list_seg[size_list-1]: + # if feat not in feature_missing_first: + # feature_missing_first.append(feat) + elif segment==list_seg[-1]: seg_last.append(seg_len) - if feat not in feature_missing_last: - feature_missing_last.append(feat) + # if feat not in feature_missing_last: + # feature_missing_last.append(feat) else: seg_middle.append(seg_len) - if feat not in feature_missing_middle: - feature_missing_middle.append(feat) + #if feat not in feature_missing_middle: + # feature_missing_middle.append(feat) else: seg_ok.append(seg_len) - - gene_start="empty" - seg_start="empty" - seg_stop="empty" - + + # for the features : + # manque le premier->feature_missing_first + # manque le dernier->feature_missing_last + # trou au milieu->feature_missing_middle + # absent totalement (0) ->feature_missing_all + # au moins un segment qui manque->feature_missing_total + # features completes chez azucena->feature_ok + # nb features total->feature_total + + if (stats==True): + feature_total.append(feat) + # premier dernier ou aucun seg présent : + if (first_seg=='0') & (last_seg=='0') : # aucun des segments de la feature n'est dans azucena. + feature_missing_all.append(feat) + elif first_seg != list_seg[0][1:]: # le premier segment est manquant chez azucena + feature_missing_first.append(feat) + elif last_seg!=list_seg[-1][1:]: # le dernier segment est manquant chez azucena + feature_missing_last.append(feat) + + # parcourir les segments, voir s'ils manquent au milieu + elif (len(list_seg)!=1) & (feat not in feature_missing_all): # pour pouvoir accéder à l'avant dernier élément + for segment in list_seg[1-(len(list_seg)-2)]: + if segment not in seg_ac: + feature_missing_middle.append(feat) + break + + + # parcourir les segments, voir s'il en manque un n'importe où + for segment in list_seg: + if segment[1:] not in seg_ac: + if feat not in feature_missing_total: + feature_missing_total.append(feat) + break + + if feat not in feature_missing_total: + feature_ok.append(feat) + + + # outputs each fragment of each feature, with its position on the new genome and its annotation : # loop that goes through all the segments that have the current feature # keeps the first segment present in azucena found, and when it finds a segment absent in azucena, prints the piece of the fragment, and resets the first segment present. # at the end of the loop, prints the last piece of the fragment. + gene_start="empty" + seg_start="empty" + seg_stop="empty" + for i in range(0,size_list): if list_seg[i][1:] in seg_ac: if gene_start=="empty": @@ -344,35 +444,91 @@ def genome_gff(pos_seg, gff, out): if list_seg[i][1:] in seg_ac: out_line=write_line(list_seg,i+1,feat,seg_start,gene_start,seg_ac) file_out.write(out_line) + + + # outputs each feature once, from the first to the last segment present on the new genome and its annotation : + if (first_seg!='0') & (last_seg!='0'): + chr=seg_ac[first_seg][0] + strand=list_seg[i-1][0:1] + ligne=chr+" graph_gff "+Features[feat].type+" "+str(get_start_2(first_seg,seg_ac,feat))+" "+str(get_stop_2(last_seg,seg_ac,feat))+" . "+strand+" . "+Features[feat].annot+"\n" + file_out_alt.write(ligne) + + + # outputs the detail of the missing fragments for each feature : + + for segment in list_seg: + if segment[1:] not in seg_ac: + segments_missing.append(Segments[segment[1:]]) + + if (len(segments_missing)!=0) & (first_seg!='0') & (last_seg!='0'): + + #print(first_seg,last_seg) + start2=get_start_2(first_seg,seg_ac,feat) + stop2=get_stop_2(last_seg,seg_ac,feat) + size_on_genome=int(stop2)-int(start2)+1 + size_diff=size_on_genome-Features[feat].size + diff_list.append(int(size_diff)) + + ligne=feat+" : "+str(len(segments_missing))+" variations. différence de taille (taille dans le nouveau génome-taille dans nipponbare) : "+str(size_diff)+"\n" + #taille du gene à l'origine : "+str(Features[feat].size)+", taille du gene sur ce génome : "+str(size_on_genome)+"\n" + file_out2.write(ligne) + + for segment in segments_missing: + if (segment.id==Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#premier et pas dernier + ligne="premier segment du gène manquant.\n" + elif (segment.id==Features[feat].segments_list[-1][1:]) & ((segment.id!=Features[feat].segments_list[0][1:])):#dernier et pas premier + ligne="dernier segment du gène manquant.\n" + elif (segment.id!=Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#ni premier ni dernier + ligne="variation au milieu du gène, de taille "+str(segment.size)+"\n" + # voir si c'est substitution ou indel + # print ce qu'il y a en face dans nb + + #size_seg_missing.append(segment.size) + + + + file_out2.write(ligne) + file_out2.write("\n") + else: + ligne=feat+": aucune variation.\n\n" + file_out2.write(ligne) + + + + + #print("difference moyenne : "+str(sum(diff_list)/len(diff_list))) + #import matplotlib.pyplot as plt + #plt.hist(diff_list, range=(-1000,1000)) + #plt.show() + #print("segment au millieu taille moyenne : "+str(sum(size_seg_missing)/len(size_seg_missing))) + file_out.close() gff.close() + file_out2.close() + file_out_alt.close() + # print stats from statistics import median, mean if stats==True: - print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first)) - print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle)) - multiple_3=0 - for seg in seg_middle: - if seg%3==0:multiple_3+=1 - print(multiple_3,"segments missing in the middle of a feature, of length=3k") - print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last)) - print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier)) - print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total)) - print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok)) - - print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(117+5)/153,2),"% hypothetical or putative.") - print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(180+3)/263,2),"% hypothetical or putative.") - print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(115+7)/156,2),"% hypothetical or putative.") - print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(314+24)/404,2),"% hypothetical or putative.") - print("there is at least one segment missing", len(feature_missing_total) ,"features, including",round(100*(606+36)/817,2),"% hypothetical or putative.") - print("there is", len(list_feature) ,"features in total, including",round(100*(2108+171)/3510,2),"% hypothetical or putative.") + + # prints the stats for the segments + #print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first)) + #print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle)) + #print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last)) + #print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier)) + #print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total)) + #print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok)) + # creates files with the features by category feat_miss_first=open('features_missing_first.txt','w') feat_miss_first.write("features missing the first segment on azucena :\n") for first in feature_missing_first : feat_miss_first.write(Features[first].annot) + if Features[first].type!='gene': + feat_miss_first.write(';') + feat_miss_first.write(Features[first].note) feat_miss_first.write("\n") feat_miss_first.close() @@ -380,6 +536,9 @@ def genome_gff(pos_seg, gff, out): feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n") for middle in feature_missing_middle : feat_miss_middle.write(Features[middle].annot) + if Features[middle].type!='gene': + feat_miss_middle.write(';') + feat_miss_middle.write(Features[middle].note) feat_miss_middle.write("\n") feat_miss_middle.close() @@ -387,6 +546,9 @@ def genome_gff(pos_seg, gff, out): feat_miss_last.write("\nfeatures missing the last segment on azucena :\n") for last in feature_missing_last : feat_miss_last.write(Features[last].annot) + if Features[last].type!='gene': + feat_miss_last.write(';') + feat_miss_last.write(Features[last].note) feat_miss_last.write("\n") feat_miss_last.close() @@ -394,82 +556,74 @@ def genome_gff(pos_seg, gff, out): feat_miss.write("\nfeatures missing entirely on azucena :\n") for entier in feature_missing_all : feat_miss.write(Features[entier].annot) + if Features[entier].type!='gene': + feat_miss.write(';') + feat_miss.write(Features[entier].note) feat_miss.write("\n") feat_miss.close() feat_miss_total=open('features_missing_total.txt','w') - feat_miss_total.write("\nfeatures missing a segment on azucena :\n") + feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n") for total in feature_missing_total : feat_miss_total.write(Features[total].annot) + if Features[total].type!='gene': + feat_miss_total.write(';') + feat_miss_total.write(Features[total].note) feat_miss_total.write("\n") feat_miss_total.close() feat_all=open('all_features.txt','w') for feat in list_feature: feat_all.write(Features[feat].annot) + if Features[feat].type!='gene': + feat_all.write(';') + feat_all.write(Features[feat].note) feat_all.write("\n") feat_all.close() - + feat_ok=open('features_ok.txt','w') + feat_ok.write("\nfeatures entirely present in azucena :\n") + for ok in feature_ok : + feat_ok.write(Features[ok].annot) + if Features[ok].type!='gene': + feat_ok.write(';') + feat_ok.write(Features[ok].note) + feat_ok.write("\n") + feat_ok.close() + # prints the stats for the features (hypothetical/putative rate, by category) + hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",)) + total=int(subprocess.getoutput("wc -l < features_missing_first.txt")) + print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/len(feature_missing_first),2),"% hypothetical or putative.") - # writes the gff of azucena using the gff of the graph -def genome_gff_test(pos_seg, gff, out): - - print("generation of the genome's gff ") - - # create a dictionnary with the positions of the segments on azucena - seg_ac={} - bed=open(pos_seg,'r') - lines=bed.readlines() - for line in lines: - line=line.split() - s_id=line[3][1:] - ch=line[0] - start=line[1] - stop=line[2] - seg_ac[s_id]=list([ch,start,stop]) - bed.close() - - gff=open(gff,'r') - file_out = open(out, 'w') - - lines=gff.readlines() + hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",)) + total=int(subprocess.getoutput("wc -l < features_missing_middle.txt")) + print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/len(feature_missing_middle),2),"% hypothetical or putative.") + + hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",)) + total=int(subprocess.getoutput("wc -l < features_missing_last.txt")) + print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/len(feature_missing_last),2),"% hypothetical or putative.") + + hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",)) + total=int(subprocess.getoutput("wc -l < features_missing.txt")) + print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/len(feature_missing_all),2),"% hypothetical or putative.") + + hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",)) + total=int(subprocess.getoutput("wc -l < features_missing_total.txt")) + print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/len(feature_missing_total),2),"% hypothetical or putative.") + + hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",)) + total=int(subprocess.getoutput("wc -l < features_ok.txt")) + print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/len(feature_ok),2),"% hypothetical or putative.") + + hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",)) + total=int(subprocess.getoutput("wc -l < all_features.txt")) + print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/len(list_feature),2),"% hypothetical or putative.") + + command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt" + subprocess.run(command,shell=True) - # get the list of all the features to transfer - list_feature=list() - for line in lines: - id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_")) - if id not in list_feature: - list_feature.append(id) - - - # for each feature, get list of the segments where it is - for feat in list_feature: - list_seg=Features[feat].segments_list - size_list=len(list_seg) - - gene_start="empty" - seg_start="empty" - seg_stop="empty" - - # prints all gene fragments. - for i in range(0,size_list): - - if (list_seg[i][1:] in seg_ac) & (1==1): - feature=Features[feat] - seg_start=list_seg[i][1:] - seg_stop=list_seg[i][1:] - chr=seg_ac[seg_start][0] - start_azu=get_start_2(seg_start,seg_ac,feature.id) - stop_azu=get_stop_2(seg_stop,seg_ac,feature.id) - out_line=chr+" "+str(start_azu)+" "+str(stop_azu)+" "+seg_start+" "+seg_stop+" "+feature.id+"\n" - file_out.write(out_line) - - file_out.close() - gff.close() - diff --git a/main.py b/main.py index 6d3ec68..36c8875 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,8 @@ import Functions as fct # creates segments and features for the intersect between the graph for chr10 and the gff of IRGSP -intersect_path='/home/nina/annotpangenome/align_genes/intersect_segments-features_chr10.bed' +intersect_path='/home/nina/annotpangenome/align_genes/intersect_segments-genes_chr10.bed' +#intersect_path='/home/nina/annotpangenome/align_genes/intersect_reel_genes_chr10.bed' fct.create_seg_feat(intersect_path) # outputs the gff of the graph for chr10 @@ -11,11 +12,14 @@ fct.graph_gff(output_file) gff="new_graph_chr10.gff" -if 1==0: +genome = 'ac' + +if genome=='ac': # transfer grom graph to azucena pos_seg="seg_coord/AzucenaRS1_chromosome10.bed" - out="azucena_chr10_all.gff" + #out="azucena_chr10_all.gff" + out="azucena_chr10.gff" -if 1==1: +if genome=='nb': # transfer from graph to nipponbare out="nb_chr10_all.gff" pos_seg="seg_coord/IRGSP-1_0_Chr10.bed" -- GitLab