From 346756b983352276e56aeef7019f8fd838afec36 Mon Sep 17 00:00:00 2001
From: NMarthe <nina.marthe34@gmail.com>
Date: Wed, 17 May 2023 13:39:52 +0200
Subject: [PATCH] corrections et ajouts dans les statistiques de transfert

---
 ClassSegFeat.py |   6 +
 Functions.py    | 348 ++++++++++++++++++++++++++++++++++--------------
 main.py         |  12 +-
 3 files changed, 265 insertions(+), 101 deletions(-)

diff --git a/ClassSegFeat.py b/ClassSegFeat.py
index 92ac81f..5486a93 100644
--- a/ClassSegFeat.py
+++ b/ClassSegFeat.py
@@ -7,6 +7,10 @@ class Segment:
         self.chr=chr
         self.start=int(start)
         self.stop=int(stop)
+
+        # segments prÃ©cedent et suivant.
+        # sÃ©quence.
+        #Â cf odgi objets
         
         self.size=self.stop-self.start+1
         
@@ -30,6 +34,8 @@ class Feature:
         self.childs=childs # liste of childs features (exons, cds, etc)
         self.parent=parent
         self.segments_list=seg #Â list of oriented segment on which the feature is (-s1/+s1, depending on the strand)
+
+        self.note=""
         
     def __str__(self):
         if self.parent=="":
diff --git a/Functions.py b/Functions.py
index 43f4236..6e7b434 100644
--- a/Functions.py
+++ b/Functions.py
@@ -1,4 +1,5 @@
 import ClassSegFeat as Class
+import subprocess
 
 # functions to create the segments and the features
 
@@ -73,6 +74,21 @@ def add_seg(feat,new_seg,strand):
         Features[feat].segments_list.append(seg_stranded)
 
     
+def set_note(id):
+    feat=Features[id]
+    if feat.type=="gene":
+        feat.note=feat.annot.split(';')[-1]
+    else:
+        curent=feat.parent
+        ann=0
+        while ann==0:
+            if Features[curent].type=="gene":
+                fonction=Features[curent].annot.split(';')[-1]
+                feat.note=fonction
+                ann=1
+            else:
+                curent=Features[Features[curent].parent].id
+
 # create all the Segment and Feature objects in the dictionnaries Segments and Features
 def create_seg_feat(intersect_path):    
 
@@ -92,6 +108,7 @@ def create_seg_feat(intersect_path):
     
         if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
             init_feature(line)
+            #set_function(feature_id)
         else: # add the current segment to the list of segments that have the existing feature
             strand=line.split()[10]
             add_seg(feature_id,segment_id,strand)
@@ -103,6 +120,9 @@ def create_seg_feat(intersect_path):
             strand=line.split()[10]
             add_feature(segment_id,feature_id,strand)
 
+    for feat_id in Features:
+        set_note(feat_id)
+
     file.close()
 
 
@@ -253,8 +273,14 @@ def genome_gff(pos_seg, gff, out):
     
     gff=open(gff,'r')
     file_out = open(out, 'w')
-    
+    file_out_alt=open("azucena_chr10_alt.gff",'w')
+    file_out2 = open("segments_manquants.txt", 'w')
+
     lines=gff.readlines()
+
+    diff_list=list()
+
+    stats=True
   
     # get the list of all the features to transfer
     list_feature=list()
@@ -264,61 +290,135 @@ def genome_gff(pos_seg, gff, out):
             list_feature.append(id)
     
     # stats on how many segments are absent in azucena, their average length, etc
-    stats=False
     if stats==True:
         seg_first=list()
         seg_middle=list()
         seg_last=list()
-        seg_total=list()
+        seg_total_abs=list()
         seg_ok=list()
         seg_entier=list()
+        seg_total=list()
+
         feature_missing_first=list()
         feature_missing_middle=list()
         feature_missing_last=list()
         feature_missing_all=list()
         feature_missing_total=list()
+        feature_total=list()
+        feature_ok=list()
+
+        #size_seg_missing=list()
+
+
+    
+    
     
-    #Â for each feature, get list of the segments where it is
+    #Â for each feature, get list of the segments where it is and the first and last segment of the feature on the genome
     for feat in list_feature:
         list_seg=Features[feat].segments_list
         size_list=len(list_seg)
+
+        segments_missing=list()
+
+        first_seg="0"
+        for segment in list_seg:
+            if segment[1:] in seg_ac:
+                first_seg=segment[1:]
+                break
+        last_seg="0"
+        for segment in reversed(list_seg):
+            if segment[1:] in seg_ac:
+                last_seg=segment[1:]
+                break
         
+
         # stats on how many segments are absent in azucena, their average length, etc
-        if stats==True:
+
+        #Â for the segments
+        # manque au dÃ©but-seg_first
+        # manque Ã  la fin-seg_last
+        # manque au milieu-seg_middle
+        # dÃ©but milieu et fin, donc feature entier ?-seg_entier
+        # nb total de segments dans un gÃ¨ne et pas chez azucena-seg_total_abs
+        # prÃ©sents chez azucena-seg_ok
+        # segments total dans une feature (prÃ©sent ou pas dans azucena)-seg_total
+
+        if (stats==True):
+
             for segment in list_seg:
                 seg_len=Segments[segment[1:]].size
                 if segment[1:] not in seg_ac:
                     seg_total.append(seg_len)
-                    if feat not in feature_missing_total:
-                        feature_missing_total.append(feat)
+                    segments_missing.append(Segments[segment[1:]])
+                    #if feat not in feature_missing_total:
+                     #   feature_missing_total.append(feat)
                     if segment==list_seg[0]:
-                        if segment==list_seg[size_list-1]:
-                            seg_entier.append(seg_len)
-                            if feat not in feature_missing_all:
-                                feature_missing_all.append(feat)
+                        if segment==list_seg[-1]:
+                            seg_entier.append(seg_len) # contient les features en un seul segment, absent chez azu.
+                      #      if feat not in feature_missing_all:
+                       #         feature_missing_all.append(feat)
                         else: 
                             seg_first.append(seg_len)
-                            if feat not in feature_missing_first:
-                                feature_missing_first.append(feat)
-                    elif segment==list_seg[size_list-1]:
+                        #    if feat not in feature_missing_first:
+                         #       feature_missing_first.append(feat) 
+                    elif segment==list_seg[-1]:
                         seg_last.append(seg_len)
-                        if feat not in feature_missing_last:
-                            feature_missing_last.append(feat)
+                       # if feat not in feature_missing_last:
+                        #    feature_missing_last.append(feat) 
                     else:
                         seg_middle.append(seg_len)
-                        if feat not in feature_missing_middle:
-                            feature_missing_middle.append(feat)
+                        #if feat not in feature_missing_middle:
+                         #   feature_missing_middle.append(feat)
                 else: 
                     seg_ok.append(seg_len)
-                    
-        gene_start="empty"
-        seg_start="empty"
-        seg_stop="empty"
-        
 
+
+        #Â for the features : 
+        # manque le premier->feature_missing_first
+        #Â manque le dernier->feature_missing_last
+        # trou au milieu->feature_missing_middle
+        # absent totalement (0) ->feature_missing_all
+        # au moins un segment qui manque->feature_missing_total
+        # features completes chez azucena->feature_ok  
+        # nb features total->feature_total
+
+        if (stats==True):
+            feature_total.append(feat)
+            # premier dernier ou aucun seg prÃ©sent :
+            if (first_seg=='0') & (last_seg=='0') : # aucun des segments de la feature n'est dans azucena.
+                feature_missing_all.append(feat)
+            elif first_seg != list_seg[0][1:]: # le premier segment est manquant chez azucena
+                feature_missing_first.append(feat)
+            elif last_seg!=list_seg[-1][1:]: # le dernier segment est manquant chez azucena
+                feature_missing_last.append(feat)
+
+            # parcourir les segments, voir s'ils manquent au milieu
+            elif (len(list_seg)!=1) & (feat not in feature_missing_all): #Â pour pouvoir accÃ©der Ã  l'avant dernier Ã©lÃ©ment
+                for segment in list_seg[1-(len(list_seg)-2)]:
+                    if segment not in seg_ac:
+                        feature_missing_middle.append(feat)
+                        break
+
+
+            #Â parcourir les segments, voir s'il en manque un n'importe oÃ¹
+            for segment in list_seg:
+               if segment[1:] not in seg_ac:
+                   if feat not in feature_missing_total:
+                        feature_missing_total.append(feat)
+                        break
+
+            if feat not in feature_missing_total:
+                feature_ok.append(feat)
+
+
+        # outputs each fragment of each feature, with its position on the new genome and its annotation : 
         # loop that goes through all the segments that have the current feature
         # keeps the first segment present in azucena found, and when it finds a segment absent in azucena, prints the piece of the fragment, and resets the first segment present.
         # at the end of the loop, prints the last piece of the fragment.
+        gene_start="empty"
+        seg_start="empty"
+        seg_stop="empty"
+
         for i in range(0,size_list):
             if list_seg[i][1:] in seg_ac:
                 if gene_start=="empty":
@@ -344,35 +444,91 @@ def genome_gff(pos_seg, gff, out):
         if list_seg[i][1:] in seg_ac:
             out_line=write_line(list_seg,i+1,feat,seg_start,gene_start,seg_ac)
             file_out.write(out_line)
+        
+        
+        # outputs each feature once, from the first to the last segment present on the new genome and its annotation :
+        if (first_seg!='0') & (last_seg!='0'):
+            chr=seg_ac[first_seg][0]
+            strand=list_seg[i-1][0:1]
+            ligne=chr+"  graph_gff   "+Features[feat].type+" "+str(get_start_2(first_seg,seg_ac,feat))+" "+str(get_stop_2(last_seg,seg_ac,feat))+"   .   "+strand+"   .   "+Features[feat].annot+"\n"
+            file_out_alt.write(ligne)
+
+        
+        # outputs the detail of the missing fragments for each feature :
+
+        for segment in list_seg:
+                if segment[1:] not in seg_ac:
+                    segments_missing.append(Segments[segment[1:]])
+
+        if (len(segments_missing)!=0) & (first_seg!='0') & (last_seg!='0'):
+            
+            #print(first_seg,last_seg)
+            start2=get_start_2(first_seg,seg_ac,feat)
+            stop2=get_stop_2(last_seg,seg_ac,feat)
+            size_on_genome=int(stop2)-int(start2)+1
+            size_diff=size_on_genome-Features[feat].size
+            diff_list.append(int(size_diff))
+
+            ligne=feat+" : "+str(len(segments_missing))+" variations. diffÃ©rence de taille (taille dans le nouveau gÃ©nome-taille dans nipponbare) : "+str(size_diff)+"\n"
+            #taille du gene Ã  l'origine : "+str(Features[feat].size)+", taille du gene sur ce gÃ©nome : "+str(size_on_genome)+"\n"
+            file_out2.write(ligne)
+
+            for segment in segments_missing:
+                if (segment.id==Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#premier et pas dernier
+                    ligne="premier segment du gÃ¨ne manquant.\n"
+                elif (segment.id==Features[feat].segments_list[-1][1:]) & ((segment.id!=Features[feat].segments_list[0][1:])):#dernier et pas premier
+                    ligne="dernier segment du gÃ¨ne manquant.\n"
+                elif (segment.id!=Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#ni premier ni dernier
+                    ligne="variation au milieu du gÃ¨ne, de taille "+str(segment.size)+"\n"
+                    # voir si c'est substitution ou indel
+                    # print ce qu'il y a en face dans nb
+
+                    #size_seg_missing.append(segment.size)
+
+
+
+                file_out2.write(ligne)
+            file_out2.write("\n")
+        else:
+            ligne=feat+": aucune variation.\n\n"
+            file_out2.write(ligne)
+        
+
+
+
+    #print("difference moyenne : "+str(sum(diff_list)/len(diff_list)))
+    #import matplotlib.pyplot as plt
+    #plt.hist(diff_list, range=(-1000,1000))
+    #plt.show()
+    #print("segment au millieu taille moyenne : "+str(sum(size_seg_missing)/len(size_seg_missing)))
+    
 
     file_out.close()
     gff.close() 
+    file_out2.close()
+    file_out_alt.close()
+
     
     # print stats
     from statistics import median, mean
     if stats==True:
-        print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
-        print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
-        multiple_3=0
-        for seg in seg_middle:
-            if seg%3==0:multiple_3+=1
-        print(multiple_3,"segments missing in the middle of a feature, of length=3k")
-        print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
-        print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
-        print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
-        print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
-    
-        print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(117+5)/153,2),"% hypothetical or putative.")
-        print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(180+3)/263,2),"% hypothetical or putative.")
-        print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(115+7)/156,2),"% hypothetical or putative.")
-        print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(314+24)/404,2),"% hypothetical or putative.")
-        print("there is at least one segment missing", len(feature_missing_total) ,"features, including",round(100*(606+36)/817,2),"% hypothetical or putative.")
-        print("there is", len(list_feature) ,"features in total, including",round(100*(2108+171)/3510,2),"% hypothetical or putative.")
+
+        # prints the stats for the segments
+        #print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
+        #print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
+        #print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
+        #print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
+        #print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
+        #print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
     
+        #Â creates files with the features by category
         feat_miss_first=open('features_missing_first.txt','w')
         feat_miss_first.write("features missing the first segment on azucena :\n")
         for first in feature_missing_first :
             feat_miss_first.write(Features[first].annot)
+            if Features[first].type!='gene':
+                feat_miss_first.write(';')
+                feat_miss_first.write(Features[first].note)
             feat_miss_first.write("\n")
         feat_miss_first.close()
         
@@ -380,6 +536,9 @@ def genome_gff(pos_seg, gff, out):
         feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
         for middle in feature_missing_middle :
             feat_miss_middle.write(Features[middle].annot)
+            if Features[middle].type!='gene':
+                feat_miss_middle.write(';')
+                feat_miss_middle.write(Features[middle].note)
             feat_miss_middle.write("\n")
         feat_miss_middle.close()
         
@@ -387,6 +546,9 @@ def genome_gff(pos_seg, gff, out):
         feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
         for last in feature_missing_last :
             feat_miss_last.write(Features[last].annot)
+            if Features[last].type!='gene':
+                feat_miss_last.write(';')
+                feat_miss_last.write(Features[last].note)
             feat_miss_last.write("\n")
         feat_miss_last.close()
     
@@ -394,82 +556,74 @@ def genome_gff(pos_seg, gff, out):
         feat_miss.write("\nfeatures missing entirely on azucena :\n")
         for entier in feature_missing_all :
             feat_miss.write(Features[entier].annot)
+            if Features[entier].type!='gene':
+                feat_miss.write(';')
+                feat_miss.write(Features[entier].note)
             feat_miss.write("\n")
         feat_miss.close()
 
         feat_miss_total=open('features_missing_total.txt','w')
-        feat_miss_total.write("\nfeatures missing a segment on azucena :\n")
+        feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
         for total in feature_missing_total :
             feat_miss_total.write(Features[total].annot)
+            if Features[total].type!='gene':
+                feat_miss_total.write(';')
+                feat_miss_total.write(Features[total].note)
             feat_miss_total.write("\n")
         feat_miss_total.close()
     
         feat_all=open('all_features.txt','w')
         for feat in list_feature:
             feat_all.write(Features[feat].annot)
+            if Features[feat].type!='gene':
+                feat_all.write(';')
+                feat_all.write(Features[feat].note)
             feat_all.write("\n")
         feat_all.close()
 
- 
+        feat_ok=open('features_ok.txt','w')
+        feat_ok.write("\nfeatures entirely present in azucena :\n")
+        for ok in feature_ok :
+            feat_ok.write(Features[ok].annot)
+            if Features[ok].type!='gene':
+                feat_ok.write(';')
+                feat_ok.write(Features[ok].note)
+            feat_ok.write("\n")
+        feat_ok.close()
 
+        # prints the stats for the features (hypothetical/putative rate, by category)
+        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
+        total=int(subprocess.getoutput("wc -l < features_missing_first.txt"))
+        print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/len(feature_missing_first),2),"% hypothetical or putative.")
 
- # writes the gff of azucena using the gff of the graph
-def genome_gff_test(pos_seg, gff, out):
-    
-    print("generation of the genome's gff ")
-    
-    # create a dictionnary with the positions of the segments on azucena
-    seg_ac={}
-    bed=open(pos_seg,'r')
-    lines=bed.readlines()
-    for line in lines:
-        line=line.split()
-        s_id=line[3][1:]
-        ch=line[0]
-        start=line[1]
-        stop=line[2]
-        seg_ac[s_id]=list([ch,start,stop])
-    bed.close()
-    
-    gff=open(gff,'r')
-    file_out = open(out, 'w')
-    
-    lines=gff.readlines()
+        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
+        total=int(subprocess.getoutput("wc -l < features_missing_middle.txt"))
+        print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/len(feature_missing_middle),2),"% hypothetical or putative.")
+
+        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
+        total=int(subprocess.getoutput("wc -l < features_missing_last.txt"))
+        print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/len(feature_missing_last),2),"% hypothetical or putative.")
+
+        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
+        total=int(subprocess.getoutput("wc -l < features_missing.txt"))
+        print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/len(feature_missing_all),2),"% hypothetical or putative.")
+
+        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
+        total=int(subprocess.getoutput("wc -l < features_missing_total.txt"))
+        print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/len(feature_missing_total),2),"% hypothetical or putative.")
+
+        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
+        total=int(subprocess.getoutput("wc -l < features_ok.txt"))
+        print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/len(feature_ok),2),"% hypothetical or putative.")
+
+        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
+        total=int(subprocess.getoutput("wc -l < all_features.txt"))
+        print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/len(list_feature),2),"% hypothetical or putative.")
+
+        command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
+        subprocess.run(command,shell=True)
 
-    # get the list of all the features to transfer
-    list_feature=list()
-    for line in lines:
-        id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
-        if id not in list_feature:
-            list_feature.append(id)
-    
-    
-    #Â for each feature, get list of the segments where it is
-    for feat in list_feature:
-        list_seg=Features[feat].segments_list
-        size_list=len(list_seg)
-        
-        gene_start="empty"
-        seg_start="empty"
-        seg_stop="empty"
-        
-        # prints all gene fragments. 
-        for i in range(0,size_list):
-            
-            if (list_seg[i][1:] in seg_ac) & (1==1):
-                feature=Features[feat]
-                seg_start=list_seg[i][1:]
-                seg_stop=list_seg[i][1:]
-                chr=seg_ac[seg_start][0]
-                start_azu=get_start_2(seg_start,seg_ac,feature.id) 
-                stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
-                out_line=chr+"  "+str(start_azu)+"  "+str(stop_azu)+"   "+seg_start+"   "+seg_stop+"   "+feature.id+"\n"
-                file_out.write(out_line)
-       
 
-    file_out.close()
-    gff.close() 
-    
 
 
 
diff --git a/main.py b/main.py
index 6d3ec68..36c8875 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,8 @@
 import Functions as fct
 
 # creates segments and features for the intersect between the graph for chr10 and the gff of IRGSP
-intersect_path='/home/nina/annotpangenome/align_genes/intersect_segments-features_chr10.bed'
+intersect_path='/home/nina/annotpangenome/align_genes/intersect_segments-genes_chr10.bed'
+#intersect_path='/home/nina/annotpangenome/align_genes/intersect_reel_genes_chr10.bed'
 fct.create_seg_feat(intersect_path)
 
 # outputs the gff of the graph for chr10
@@ -11,11 +12,14 @@ fct.graph_gff(output_file)
 
 gff="new_graph_chr10.gff"
 
-if 1==0:
+genome = 'ac'
+
+if genome=='ac': # transfer grom graph to azucena
     pos_seg="seg_coord/AzucenaRS1_chromosome10.bed"
-    out="azucena_chr10_all.gff"
+    #out="azucena_chr10_all.gff"
+    out="azucena_chr10.gff"
 
-if 1==1:
+if genome=='nb': # transfer from graph to nipponbare
     out="nb_chr10_all.gff"
     pos_seg="seg_coord/IRGSP-1_0_Chr10.bed"
 
-- 
GitLab