Merge branch 'branch-source' into setup

9c01d0e3 · nina.marthe_ird.fr · f25c3185 · b3e4655d · 9c01d0e3 · 9c01d0e3
Commit 9c01d0e3 authored 1 year ago by nina.marthe_ird.fr
--- a/ClassSegFeat.py
+++ b/ClassSegFeat.py
+class Segment:
+    def __init__(self,id,feature,chr,start,stop):
+        self.id=id
+        self.features=feature # list of the features on this segment. if the feature is on the + strand, it will be named '+feature_id', else '-feature_id'
+
+        # position on the original genome
+        self.chr=chr
+        self.start=int(start)
+        self.stop=int(stop)
+        
+        self.size=self.stop-self.start+1
+        
+    def __str__(self):
+        return f"id={self.id}, position on the original genome={self.chr}:{self.start}-{self.stop}, size={self.size}, features={self.features}"
+
+
+class Feature:
+    def __init__(self,id,type,chr,start,stop,annot,childs,parent,seg,strand):
+        self.id=id
+        self.type=type
+        
+        # position on the original genome
+        self.chr=chr
+        self.start=int(start)
+        self.stop=int(stop)
+        self.strand=strand
+        
+        self.size=int(stop)-int(start)+1
+
+        self.pos_start=0 # position on the first segment
+        self.pos_stop=0 # position on the last segment
+        
+        self.annot=annot
+        self.childs=childs # liste of child features (exons, cds, etc)
+        self.parent=parent
+        self.segments_list=seg # list of oriented segments on which the feature is (>s1/<s1, depending on the path of the gene in the graph)
+
+        self.note=""
+
+        self.sequence=""
+
+        
+    def __str__(self):
+        if self.parent=="":
+            return f"id={self.id}, type={self.type}, segments={self.segments_list}, position on the original genome={self.chr}:{self.start}-{self.stop}, childs={self.childs}, annotation={self.annot}"
+        else:
+            return f"id={self.id}, type={self.type}, segments={self.segments_list}, position on the original genome={self.chr}:{self.start}-{self.stop}, parent={self.parent}, childs={self.childs}, annotation={self.annot}"
+ 
\ No newline at end of file
--- a/Functions.py
+++ b/Functions.py
--- a/Functions_output.py
+++ b/Functions_output.py
+from Graph_gff import Segments, Features, write_line
+from Functions import *
+
+
+### functions to generate a genome's gff from the graph's gff
+
+
+# functions to get the detailed gff with all the fragments of the features
+# outputs each fragment of the feature in a gff
+def gff_detail(list_seg,feature_id):
+    # loop that goes through all the segments that have the current feature
+    # keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the current part of the fragment, and resets the first segment present.
+    # continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature, etc.
+    # at the end of the loop, prints the last part of the fragment.
+    [feat_start,seg_start,last_seg]=["","",""] # first segment of the feature, first segment of the current part of the feature, last segment in the part of the feature, for loop below
+    for segment in list_seg:
+        if segment[1:] in segments_on_target_genome: 
+            if feat_start=="":
+                feat_start=segment[1:]
+            if seg_start=="": # if we dont have a start, take the current segment for the start of the part of the feature
+                seg_start=segment[1:]
+            #else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
+        else:
+            if seg_start!="": # found a stop and we have a start. print the line, reset seg_start, and keep going through the segments to find the next seg_start
+                out_line=create_line_detail(last_seg,feature_id,seg_start,feat_start)
+                write_line(out_line,output_detail_gff,False)
+                seg_start=""
+            #else: if the current segment is not in azucena but there is no start, keep looking for a start
+        last_seg=segment    
+    if last_seg[1:] in segments_on_target_genome:
+        out_line=create_line_detail(list_seg[-1],feature_id,seg_start,feat_start)
+        write_line(out_line,output_detail_gff,False)
+
+
+# functions to get the gff with one line per feature
+# outputs the feature once in a gff, from the first to the last segment present on the new genome (if the size is ok) :
+def gff_one(first_seg,last_seg,feature_id,list_seg,max_diff):
+
+    if (first_seg!=''): # feature present on the target genome
+        size_on_new_genome=get_feature_stop_on_genome(last_seg,feature_id)-get_feature_start_on_genome(first_seg,feature_id)+1
+        size_diff=abs(size_on_new_genome-Features[feature_id].size)
+        if right_size(size_on_new_genome,max_diff,feature_id): 
+            line=create_line_gff_one(first_seg,last_seg,feature_id,size_diff,list_seg)
+            write_line(line,output_gff_one,False)
+            return size_diff
+        else:
+            return "bad_size"
+    else :
+        return "absent"
+
+
+
+# writes the gff of azucena using the gff of the graph
+def genome_gff(pos_seg, gff, gfa, out_once, out_detail, out_var,target_genome_name):
+    print("generation of the genome's gff ")
+
+    # create variables and open files
+    [once,detail,var,stats]=[True,True,True,False]
+    if var:
+        [paths,seg_seq]=get_segments_sequence_and_paths(gfa)
+        file_out_var = open(out_var, 'w')
+        global output_variations
+        output_variations=[0,"",file_out_var]
+    if detail:
+        file_out_detail = open(out_detail, 'w')
+        global output_detail_gff
+        output_detail_gff=[0,"",file_out_detail]
+    if once:
+        file_out=open(out_once,'w')
+        global output_gff_one
+        output_gff_one=[0,"",file_out]
+        bad_size_features=0
+        absent_features=0
+        diff_size_transfered_features=[0,0] # [count,sum], to get the average
+    if stats:
+        # create objects for stats on how many segments are absent in azucena, their average length, etc
+        feature_missing_segments=[[],[],[],[],[],[],[]] # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
+        # the fist segment of the feature is missing - feature_missing_first
+        # the last segment of the feature is missing - feature_missing_last
+        # at least one middle segment of the feature is missing - feature_missing_middle
+        # the entire feature is missing - feature_missing_all
+        # at least one segment is missing first, last, or middle) - feature_missing_total
+        # no segment is missing, the feature is complete - feature_ok  
+        # total number of features, with missing segments or not - feature_total
+    get_segments_positions_on_genome(pos_seg)
+    list_feature_to_transfer=get_all_features_in_gff(gff) # get the list of all the features to transfer from the gff
+
+    for feat in list_feature_to_transfer:
+
+        # for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
+        list_seg=Features[feat].segments_list
+        first_seg=get_first_seg(list_seg)
+        last_seg=get_first_seg(reversed(list_seg))
+
+        # outputs the feature once in a gff, from the first to the last segment present on the new genome
+        if once:
+            max_diff=2 # maximum difference size (n times bigger of smaller)
+            transfer_stat=gff_one(first_seg,last_seg,feat,list_seg,max_diff) # insertion not considered !!!
+            if transfer_stat=="bad_size":
+                bad_size_features+=1
+            elif transfer_stat=="absent":
+                absent_features+=1
+            else:
+                diff_size_transfered_features[0]+=1
+                diff_size_transfered_features[1]+=transfer_stat
+            write_line("",output_gff_one,True)
+
+        # outputs each fragment of the feature in a gff
+        if detail:
+            gff_detail(list_seg,feat) # insertions !
+            write_line("",output_detail_gff,True)
+
+        # outputs the detail of variations of the feature :
+        if var:
+            print_variations(first_seg,last_seg,feat,paths,seg_seq,target_genome_name)
+            write_line("",output_variations,True)
+
+        if stats==True:
+            stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feat)
+
+    # close output_files
+    if detail:
+        file_out_detail.close()
+    if once:
+        file_out.close()
+    if var:
+        file_out_var.close()
+
+
+    # print stats
+    if stats==True:
+        if once:
+            print(len(Features)-(bad_size_features+absent_features),"out of",len(Features),"features are transfered.")
+            print(bad_size_features,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
+            print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
+            print("average length difference of the transfered genes : ",diff_size_transfered_features[1]/diff_size_transfered_features[0])
+        stats_features(feature_missing_segments)
+
+
+
+
+# functions to get the detail of the variations in the features
+
+def print_variations(first_seg,last_seg,feat,paths,seg_seq,target_genome_name):
+
+    if (first_seg!=''): # if the feature is not completly absent        # add the else, output absent features
+        [variation,list_segfeat_nb,list_segfeat_azu]=create_var(feat,first_seg,last_seg,paths,target_genome_name) # removes the strands in the segment lists
+        feature=Features[feat]
+        feat_start=feature.start
+        # loop to go through both paths with i and j
+        [i,j,var_count]=[0,0,0]
+
+        # detect and print variations ignoring the strands
+        while (i<len(list_segfeat_nb)) & (j<len(list_segfeat_azu)):
+            if list_segfeat_nb[i] != list_segfeat_azu[j]: # if there is a difference between the two paths
+
+                if list_segfeat_azu[j] not in list_segfeat_nb: # if the segment in azu is absent in nb
+                    if list_segfeat_nb[i] not in list_segfeat_azu: # if the segment in nb is absent is azu
+
+                        # if both segments are absent in the other genome, its a substitution
+                        variation.last_seg_in_target=list_segfeat_azu[j][1:]
+                        if (variation.type=='insertion') | (variation.type=='deletion'): # print the current variation before treating the substitution
+                            print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
+                            reset_var(variation)
+                            var_count+=1
+                        if variation.type=='substitution':
+                            continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,0)
+                        else: # initiate substitution
+                            init_new_var(variation,"substitution",list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature)
+                        i+=1;j+=1
+
+                    else: # azu segment not in nb, but nb segment in azu : insertion or continue substitution
+                        if variation.type=='deletion': # print the current variation before treating the insertion
+                            print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
+                            reset_var(variation)
+                            var_count+=1
+                        variation.last_seg_in_target=list_segfeat_azu[j][1:]
+                        if variation.type=='insertion':
+                            continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,0)
+                        elif variation.type=="substitution":
+                            while list_segfeat_azu[j]!=list_segfeat_nb[i]:
+                                continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,2)
+                                j+=1
+                            print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
+                            reset_var(variation)
+                            var_count+=1
+                            j-=1
+                        else: # intiate insertion
+                            init_new_var(variation,"insertion",list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature)
+                        j+=1
+
+                elif list_segfeat_nb[i] not in list_segfeat_azu: # nb segment not in azu, but azu segment in nb : deletion or continue substitution
+                    if variation.type=='insertion': # print the current variation before treating the deletion
+                        print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
+                        reset_var(variation)
+                        var_count+=1
+                    if variation.type=='deletion':
+                        continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,0)
+                    elif variation.type=="substitution":
+                        while list_segfeat_azu[j]!=list_segfeat_nb[i]:
+                            continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,1)
+                            i+=1
+                        print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
+                        reset_var(variation)
+                        var_count+=1
+                        i-=1
+                    else: # intiate deletion
+                        init_new_var(variation,"deletion",list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature)
+                    i+=1
+
+                else : # if both segments are present in the other genome but not at the same position. weird case never found yet
+                    # can be a substitution. check later if its not an inversion
+                    variation.last_seg_in_target=list_segfeat_azu[j][1:]
+                    if (variation.type=='insertion') | (variation.type=='deletion'): # print the current variation before treating the substitution
+                        print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
+                        reset_var(variation)
+                        var_count+=1
+                    if variation.type=='substitution':
+                        continue_var(variation,seg_seq,list_segfeat_nb,list_segfeat_azu,i,j,0)
+                    else: # initiate substitution
+                        init_new_var(variation,"substitution",list_segfeat_nb,list_segfeat_azu,i,j,seg_seq,feature)
+                    i+=1;j+=1
+
+            else: # segment present in both, no variation. print the running indel if there is one
+                if variation.type!='': # print the current variation if there is one
+                    print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
+                    var_count+=1
+                    reset_var(variation)
+                variation.last_seg_in_target=list_segfeat_azu[j][1:]
+                i+=1;j+=1
+
+        if (variation.type!=''): # if there was a current variation when we reached the end, print it
+            print_current_var(variation,feat_start,list_segfeat_azu,feat,i)
+            var_count+=1
+            reset_var(variation)
+
+        if i<=len(list_segfeat_nb)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
+            print_last_deletion(variation,list_segfeat_nb,i,feat_start,feature,seg_seq)
+            var_count+=1
+            reset_var(variation)
+
+        if var_count==0: # if no variation was encountered
+            print_novar(variation)
+
+
+def print_current_var(variation,feat_start,list_segfeat_azu,feat,i):
+    warning=detect_small_inversion(variation)
+    if variation.type=='insertion':
+        [pos_old,pos_new]=get_old_new_pos_insertion(variation,feat_start,list_segfeat_azu,feat)
+        line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tinsertion\t-\t{variation.alt}\t{len(variation.alt)}\t{pos_old}\t{pos_new}{warning}\n'
+        write_line(line,output_variations,False)
+    elif variation.type=='deletion':
+        [pos_old,pos_new]=get_old_new_pos_deletion(variation,feat_start,list_segfeat_azu,feat)
+        line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tdeletion\t{variation.ref}\t-\t{len(variation.ref)}\t{pos_old}\t{pos_new}{warning}\n'
+        write_line(line,output_variations,False)
+    elif variation.type=='substitution':
+        [pos_old,pos_new]=get_old_new_pos_substitution(feat_start,variation,list_segfeat_azu,feat)
+
+        size_subs=f'{len(variation.ref)}/{len(variation.alt)}'
+        line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tsubstitution\t{variation.ref}\t{variation.alt}\t{size_subs}\t{pos_old}\t{pos_new}{warning}\n'
+        
+        # print the substitutions of different size as deletion+insertion.
+        #if len(variation.ref) == len(variation.alt): # if the substituion is between two segment of the same size, print it
+        #    size_subs=len(variation.ref)
+        #    line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tsubstitution\t{variation.ref}\t{variation.alt}\t{size_subs}\t{pos_old}\t{pos_new}{warning}\n'
+        #else :
+        #    # if the segments of the substitution have a different size, print deletion then insertion at the same position.
+        #    line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tdeletion\t{variation.ref}\t-\t{len(variation.ref)}\t{pos_old}\t{pos_new}{warning}\n'
+        #    line+=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tinsertion\t-\t{variation.alt}\t{len(variation.alt)}\t{pos_old}\t{pos_new}{warning}\n'
+        write_line(line,output_variations,False)
+
+def detect_small_inversion(variation):
+    [list_ref_common,list_alt_common]=[list(),list()]
+    list_ref_unstrand=[segment_stranded[1:] for segment_stranded in variation.seg_ref]
+    list_alt_unstrand=[segment_stranded[1:] for segment_stranded in variation.seg_alt]
+    for seg in variation.seg_ref:
+        if seg[1:] in list_alt_unstrand:
+            list_ref_common.append(seg)
+    for seg in variation.seg_alt:
+        if seg[1:] in list_ref_unstrand:
+            list_alt_common.append(seg)
+    if (len(list_ref_common)>len(list_ref_unstrand)*0.5) & (len(list_alt_common)>len(list_alt_unstrand)*0.5):
+        return f'\t# Suspected inversion within this substitution.'
+    else:
+        return ''
+
+
+
+def print_last_deletion(variation,list_segfeat_nb,i,feat_start,feature,seg_seq):
+    pos_old=int(Segments[list_segfeat_nb[i][1:]].start)-int(feat_start)+1
+    del_sequence=get_sequence_list_seg(list_segfeat_nb,i,feature,seg_seq)
+    length=len(del_sequence)
+    pos_new=str(int(variation.size_new)+1) # the deletion is at the end of the feature on the new genome
+
+    line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tdeletion\t{del_sequence}\t-\t{length}\t{pos_old}\t{pos_new}\n'
+    write_line(line,output_variations,False)
+
+def print_novar(variation):
+    line=f'{variation.feature_id}\t{variation.feature_type}\t{variation.chr}\t{variation.start_new}\t{variation.stop_new}\t{variation.size_new}\t{variation.inversion}\t{variation.size_diff}\tno_var\t-\t-\t-\t-\t-\n'
+    write_line(line,output_variations,False)
+
+
+# not used. 
+
+def get_list_segments_missing(list_seg,segments_on_target_genome):
+    segments_missing=[]
+    for segment in list_seg:
+            if segment[1:] not in segments_on_target_genome:
+                segments_missing.append(Segments[segment[1:]])
+    return segments_missing
+
+# takes a feature and a feature type, returns a list of child features that have the wanted type. currently not used.
+def get_child_list(feature,child_type):
+    if type=="":
+        return feature.childs
+    list_childs=[]
+    for child in feature.childs:
+        if Features[child].type==child_type:
+            list_childs.append(child)
+    return list_childs
+
+
--- a/Graph_gff.py
+++ b/Graph_gff.py
+import ClassSegFeat as Class
+
+# create global variables to manipulate them in the functions below and to pass them to Functions.py
+global Features
+global Segments
+Segments={}
+Features={}
+
+global output_graph_gff
+output_graph_gff=[0,"",""]
+
+global output_graph_gaf
+output_graph_gaf=[0,"",""]
+
+# write in output file
+def write_line(line,output,force):
+    # output [count_line;string_lines;output_file]
+    output[1]+=line
+    output[0]+=1
+    if (output[0]>999) | (force==True):
+        output[2].write(output[1])
+        output[0]=0
+        output[1]=""
+
+
+# functions to create the segments and the features
+
+# create a segment with its first feature
+def init_seg(line,segment_id,feature_stranded):
+    [chr,start,stop]=[line[0],line[1],line[2]]
+    # add the current feature to the list of features that are on the segment
+    feature_list=[feature_stranded]
+    # create the segment, store it in the Segments dict
+    Segments[segment_id]=Class.Segment(segment_id,feature_list,chr,start,stop)
+
+# create a feature with the first segment its on
+def init_feature(line,feature_id,segment_oriented):
+    [type,annot,chr,start,stop,childs,strand]=[line[6],line[12],line[4],line[7],line[8],[],line[10]]
+    parent=get_parent(annot,feature_id)
+    # add the current segment to the list of segments that have the feature
+    segments_list=[segment_oriented]
+    # create the feature, store it in the dict Features
+    Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list,strand)
+    set_note(feature_id)
+
+# if there is a parent, returns the id of the parent feature and add the child to the parent
+def get_parent(annot,feature_id):
+    if annot.split(";")[1].split("=")[0]=="Parent":
+    # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
+        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
+        add_child(parent,feature_id)
+    elif annot.split(";")[2].split("=")[0]=="Parent":
+    # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
+        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
+        add_child(parent,feature_id)
+    else: parent=""
+    return parent
+
+# add a feature to an existing segment
+def add_feature(seg,new_feature_stranded):
+    if new_feature_stranded not in Segments[seg].features:
+        Segments[seg].features.append(new_feature_stranded)
+
+# add a child feature to an existing feature
+def add_child(feat,new_child):
+    if (feat in Features.keys()) & (new_child not in Features[feat].childs): # if the parent feature exists
+        Features[feat].childs.append(new_child)
+
+# add a segment to an existing feature
+def add_seg(feat_id,new_seg_oriented):
+    if new_seg_oriented not in Features[feat_id].segments_list:
+        Features[feat_id].segments_list.append(new_seg_oriented)
+
+# add the position of the feature on its first and last segment
+def add_pos(feature):
+    feature.pos_start=get_feature_start_on_segment(feature.segments_list[0][1:],feature.id)
+    feature.pos_stop=get_feature_stop_on_segment(feature.segments_list[-1][1:],feature.id)
+
+def load_feature_intersect(line,feature_id,seg_oriented):
+    if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
+        init_feature(line, feature_id, seg_oriented)
+    else: # if it exists, add the current segment to the list of segments that have the existing feature
+        add_seg(feature_id,seg_oriented)
+
+def load_segment_intersect(line,segment_id,feature_stranded):
+    if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
+        init_seg(line,segment_id,feature_stranded)
+    else: # if it exists, add the current feature to the list of features on the existing segment
+        add_feature(segment_id,feature_stranded)
+
+
+# create a note for the child features that do not have annotation.     not clean. fct for getting parent ? 
+def set_note(feature_id):
+    # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
+    # in the gff, the notes are only on the "gene" features. it's easier to have it for the childs than to check the parent's note (or the parent's parent). 
+    feature=Features[feature_id]
+    if feature.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
+        feature.note=feature.annot.split(';')[-1]
+    else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
+        # we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
+        # this is because for example the parent of an exon is the mrna, not the gene itself, so we need to go up until we find the gene.
+        curent=feature.parent
+        annot_found=False
+        while annot_found==False:
+            if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
+                note=Features[curent].annot.split(';')[-1]
+                feature.note=note
+                annot_found=True
+            else: # if we didn't find the gene, we go up to the current feature's parent until we find it
+                curent=Features[Features[curent].parent].id
+
+
+# create all the Segment and Feature objects in the dictionnaries Segments and Features
+def load_intersect(intersect_path):
+    print("loading the intersect file\n")
+    
+    # open the file with the intersect between the segments and the gff
+    file = open(intersect_path, 'r')
+    lines=file.readlines()
+    file.close()
+    
+    for line in lines: # for each line in the intersect file
+        line=line.split()
+        # get the ids for the dictionnaries' keys
+        feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
+        segment_id=line[3][1:]
+        strand=line[10]
+        segment_oriented=line[3]
+        feature_stranded=strand+feature_id
+    
+        load_feature_intersect(line,feature_id,segment_oriented)
+        load_segment_intersect(line,segment_id,feature_stranded)
+
+    # for all the features, add the position of the feature on its first and last segment.
+    # cant do it before because for that i need to have all the segments in the list segments_list for each feature.
+    for feat_id in Features:
+        add_pos(Features[feat_id])
+
+
+
+
+
+
+
+# functions to generate the graph's gff from the segments and features created with create_seg_feat
+
+# get the feature's start position on the segment
+def get_feature_start_on_segment(seg_id,feat_id):
+    s=Segments[seg_id]
+    f=Features[feat_id]
+    if s.start>=f.start:
+        result=1
+    else:
+        result=f.start-s.start+1
+    return result
+
+# get the feature's stop position on the segment
+def get_feature_stop_on_segment(seg_id,feat_id):
+    s=Segments[seg_id]
+    f=Features[feat_id]
+    if s.stop<=f.stop:
+        result=s.size
+    else:
+        result=f.stop-s.start+1
+    return result
+
+# generates the annotation for the gff of the graph, with the rank of the segment in the feature
+def make_annot_graph(feature,segment_id):
+
+    if ">"+segment_id in feature.segments_list:
+        segment_oriented=">"+segment_id
+    else:
+        segment_oriented="<"+segment_id
+        # chat if the segment is present twice in the list, with the two orientation ??
+
+    # get the rank and the total number of ocurrences for the feature
+    rank=str(feature.segments_list.index(segment_oriented)+1)
+    total=str(len(feature.segments_list))
+    # create the annotation with the rank information
+    annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total
+    return annotation
+
+def create_line_graph_gff(feature_stranded,segment_id):
+    [strand,feature_id]=[feature_stranded[0:1],feature_stranded[1:]]
+    feature=Features[feature_id]
+
+    annotation=make_annot_graph(feature,segment_id)
+    type=feature.type
+    start=get_feature_start_on_segment(segment_id,feature_id)
+    stop=get_feature_stop_on_segment(segment_id,feature_id)
+    line=segment_id+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
+    return line
+
+# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
+def graph_gff(graph_gff_file):
+    print("generation of the graph's gff")
+    graph_gff_file = open(graph_gff_file, 'w')
+    output_graph_gff[2]=graph_gff_file
+
+    for segment_id in Segments:
+        features_on_seg=Segments[segment_id].features # get the list of the features on the segment
+        for feature_stranded in features_on_seg: # go through all the segment's features, and print the gff line for each
+            line = create_line_graph_gff(feature_stranded,segment_id)
+            write_line(line,output_graph_gff,False)
+    write_line("",output_graph_gff,True)
+
+    graph_gff_file.close()
+
+def graph_gaf(graph_gaf_file,gfa_file):
+    print("generation of the graph's gaf")
+    graph_gaf_file = open(graph_gaf_file, 'w')
+    output_graph_gaf[2]=graph_gaf_file
+
+    seg_len=get_segments_length(gfa_file)
+    for feature_id in Features:
+        feature=Features[feature_id]
+        feature_segments=feature.segments_list
+        feature_path=""
+        path_length=0
+        for segment in feature_segments:
+            feature_path+=segment[0]+segment[2:]
+            path_length+=seg_len[segment[1:]]
+        strand=feature.strand
+        size=feature.size
+        start_on_path=feature.pos_start
+        stop_on_path=start_on_path+size
+
+        line=f'{feature.id}\t{size}\t0\t{size}\t{strand}\t{feature_path}\t{path_length}\t{start_on_path}\t{stop_on_path}\t{size}\t{size}\t255\n'
+        write_line(line,output_graph_gaf,False)
+    write_line("",output_graph_gaf,True)
+
+    graph_gaf_file.close()
+
+
+
+
+
+def get_segments_length(gfa):
+    file_gfa=open(gfa,'r')
+    lines_gfa=file_gfa.readlines()
+    file_gfa.close()
+    seg_len={}
+    segment_encountered=False
+    for line in lines_gfa:
+        line=line.split()
+        if (line[0]=="S"): # get the length of the segment
+            segment_encountered=True
+            seg_id='s'+line[1]
+            seg_len[seg_id]=len(line[2])
+        elif segment_encountered==True: # so we don't have to go through all the L lines. once we found one S, if we find something else than S it stops
+            return seg_len
+    return seg_len
\ No newline at end of file
--- a/add_features_to_graph.py
+++ b/add_features_to_graph.py
+gfa_path="data/graphs/RiceGraphChr10_cactus.gfa"
+gfa_file=open(gfa_path,"r")
+lines_gfa=gfa_file.readlines()
+gfa_file.close()
+
+gaf_path="graph_chr10.gaf"
+gaf_file=open(gaf_path,"r")
+lines_gaf=gaf_file.readlines()
+gaf_file.close()
+
+gff_path="data/genes/nb_genes_chr10.gff3"
+gff_file=open(gff_path,'r')
+lines_gff=gff_file.readlines()
+gff_file.close()
+
+
+# make dict for finding start and stop position on ref for the features
+feature_pos={}
+for line in lines_gff:
+    line=line.split()
+    feature_id=line[8].split(";")[0].split("=")[1]
+    feature_start=line[3]
+    feature_stop=line[4]
+    feature_pos[feature_id]=[feature_start,feature_stop]
+
+# create the new W lines for the features' walks in the graph
+new_W_lines=""
+for line in lines_gaf:
+    line=line.split()
+    feature_id=line[0]
+    if feature_id in feature_pos:
+        feature_start=feature_pos[feature_id][0]
+        feature_stop=feature_pos[feature_id][1]
+    else:
+        feature_start="."
+        feature_stop="."
+    # new_W_lines = W   SampleId    HapIndex    SequenceId  start_on_ref    stop_on_ref walk
+    new_W_lines+=f'W\tIRGSP-1_0\t0\t{feature_id}\t{feature_start}\t{feature_stop}\t{line[5]}\n'
+
+
+
+gfa_file_out=open("data/RiceGraphChr10_cactus_annotated.gfa","w")
+[W_found,W_passed,W_printed]=[False,False,False]
+
+for line in lines_gfa:
+    if line[0]=="W":
+        W_found=True
+    if (line[0]!="W") & W_found:
+        W_passed=True
+    if W_passed & (not W_printed):
+        gfa_file_out.write(new_W_lines)
+        W_printed=True
+    gfa_file_out.write(line)
+gfa_file_out.close()
\ No newline at end of file
--- a/getSegmentsCoordinates.py
+++ b/getSegmentsCoordinates.py
+import subprocess
+import sys
+
+def has_numbers(inputString):
+    return any(char.isdigit() for char in inputString)
+
+def getChrName(chromosome_field):
+    chromosome_id=""
+    if has_numbers(chromosome_field):
+        for char in reversed(chromosome_field): # take the last digits of the field
+            if not char.isdigit():
+                break
+            else:
+                chromosome_id+=char
+        chromosome_id="Chr"+chromosome_id[::-1]
+    else:
+        for char in reversed(chromosome_field): # take the last uppercase chars of the fied
+            if not char.isupper():
+                break
+            else:
+                chromosome_id+=char
+        chromosome_id="Chr"+chromosome_id[::-1]
+    return chromosome_id
+
+if not(len(sys.argv)==2) :
+        print("expected input : gfa file with walks.")
+        print("output : bed files giving the coordinates of the segments on the genomes (or on minigraph segments).")
+        sys.exit(1)
+elif (sys.argv[1]=="-h") :
+    print("expected input : gfa file with walks.")
+    print("output : bed files giving the coordinates of the segments on the genomes (or on minigraph segments).")
+    sys.exit(1)
+
+gfa=sys.argv[1]
+
+# get the lines that start with "S"
+command="grep ^S "+gfa+" > segments.txt"
+subprocess.run(command,shell=True)
+segments = open('segments.txt','r')
+lines=segments.readlines()
+segments.close()
+
+# build a dictionnary with the segment sizes
+segments_size={}
+for line in lines:
+    line=line.split()
+    seg_id='s'+line[1]
+    seg_size=len(line[2])
+    segments_size[seg_id]=seg_size
+
+# get the lines that start with "W"
+command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > walks.txt"
+subprocess.run(command,shell=True)
+walks=open('walks.txt','r')
+lines=walks.readlines()
+walks.close()
+
+# on these lines, get the name of the genome to name the output bed file
+file_names=list()
+for line in lines :
+    line=line.split()
+    name=line[3]
+    path_start=int(line[4])
+    chromosome_field=line[3]
+    chromosome_id=getChrName(chromosome_field)
+
+    file_name=name+'.bed'
+
+    # if we are writing in the file for the first time, overwrite it. else, append it
+    # this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
+    if file_name not in file_names:
+        file_names.append(file_name)
+        out_bed = open(file_name, 'w')
+    else :
+        out_bed = open(file_name, 'a')
+        
+    path=line[6].split(',')
+    position=path_start
+    
+    for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
+        # coordinates calculation : start=position, stop=position+segment_size-1, then position+=segment_size
+        
+        chr='Chr'+chromosome_id[len(chromosome_id)-2:]
+        
+        seg_start=position
+        seg_name='s'+path[i][1:]
+        seg_stop=position+segments_size[seg_name]-1
+
+        if path[i][0:1]=='>':
+            orientation='+'
+        elif path[i][0:1]=='<':
+            orientation='-'
+        else:
+            orientation='.'
+    
+        out_line=chr+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
+        out_bed.write(out_line)
+        
+        position+=segments_size[seg_name]
+    out_bed.close()
+ 
+command="rm segments.txt && rm walks.txt"
+subprocess.run(command,shell=True)
+
+
+command="if [ 0 -lt $(ls _MINIGRAPH_* 2>/dev/null | wc -w) ]; then mkdir minigraph_segments && mv _MINIGRAPH_* minigraph_segments; fi"
+subprocess.run(command,shell=True)
\ No newline at end of file
--- a/inference.py
+++ b/inference.py
+from Graph_gff import Features,load_intersect
+from Functions import get_segment_sequence,convert_strand
+
+target_genome_name="genome2_chr10"
+pos_seg=target_genome_name+".bed"
+var_file=target_genome_name+"_variations.txt"
+gfa="graph.gfa"
+intersect_path='intersect.bed'
+load_intersect(intersect_path)
+
+
+
+
+# présent dans Fuctions.py mais relou à importer ?
+def get_segments_sequence_and_paths(gfa): # creates two dict with the sequences of the graph's segments, and the paths 
+    file_gfa=open(gfa,'r')
+    lines_gfa=file_gfa.readlines()
+    file_gfa.close()
+    seg_seq={}
+    paths={}
+    for line in lines_gfa:
+        line=line.split()
+        if (line[0]=="S"): # get the sequence of the segment
+            seg_id='s'+line[1]
+            seg_seq[seg_id]=line[2]
+
+        if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
+            
+            path=line[6].replace(">",";>")
+            path=path.replace("<",";<").split(';')
+            list_path=[]
+            for segment in path:
+                if segment[0:1]=='>':
+                    list_path.append('+s'+segment[1:])
+                elif segment[0:1]=='<':
+                    list_path.append('-s'+segment[1:])
+ 
+            paths[line[3]]=list_path
+    return [paths,seg_seq]
+
+# présent dans Fuctions.py mais relou à importer ?
+def get_segments_positions_on_genome(pos_seg): # creates a dict with the position of the segments on the target genome
+    bed=open(pos_seg,'r')
+    lines=bed.readlines() # read line by line ?
+    bed.close()
+    segments_on_target_genome={}
+    for line in lines:
+        line=line.split()
+        [seg,chrom,start,stop,strand]=[line[3][1:],line[0],line[1],line[2],line[3][0:1]]
+        segments_on_target_genome[seg]=[chrom,start,stop,strand]
+    return segments_on_target_genome
+
+def add_feature_sequence(feature,seg_seq): # add the feature's sequence in the Feature object
+    feature_sequence=""
+    for segment in feature.segments_list:
+        if segment==feature.segments_list[0]:
+            feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:] # revérifier les +/- 1 pour la position, avec de vraies données
+        elif segment==feature.segments_list[-1]:
+            feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop] # revérifier les +/- 1 pour la position, avec de vraies données
+        else:
+            feature_sequence+=get_segment_sequence(seg_seq,segment)
+    feature.sequence=feature_sequence
+
+def get_first_seg(list_seg,segments_on_target_genome): # get the first segment of the list that is in the target genome
+    first_seg_found=''
+    for segment in list_seg:
+        if segment[1:] in segments_on_target_genome:
+            first_seg_found=segment[1:]
+            break
+    return first_seg_found
+
+def get_feature_path(paths,first_seg,last_seg):# find the feature's path in the target genome
+    first_strand=convert_strand(segments_on_target_genome[first_seg][3])
+    first_seg_stranded=first_strand+first_seg
+    last_strand=convert_strand(segments_on_target_genome[last_seg][3])
+    last_seg_stranded=last_strand+last_seg
+    index_first_seg=int(paths[target_genome_name].index(first_seg_stranded))
+    index_last_seg=int(paths[target_genome_name].index(last_seg_stranded))
+    first_index=min(index_first_seg,index_last_seg)
+    last_index=max(index_last_seg,index_first_seg)
+    list_segfeat_azu=paths[target_genome_name][first_index:last_index+1]
+    list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu]
+    return list_segfeat_azu_corrected
+
+def get_rna(dna_sequence):
+    return dna_sequence.replace("T","U")
+
+def get_aa(rna_codon): # gives the aa coded by the rna codon
+    match rna_codon[0:2]:
+        case "UU":
+            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
+                return "Phe"
+            else:
+                return "Leu"
+        case "UC":
+            return "Ser"
+        case "UA":
+            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
+                return "Tyr"
+            else:
+                return "*"
+        case "UG":
+            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
+                return "Cys"
+            elif rna_codon[2]=="A":
+                return "*"
+            else:
+                return "Trp"
+        case "CU":
+            return "Leu"
+        case "CC":
+            return "Pro"
+        case "CA":
+            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
+                return "His"
+            else:
+                return "Gln"
+        case "CG":
+            return "Arg"
+        case "AU":
+            if rna_codon[2]=="G":
+                return "Met"
+            else:
+                return "Ile"
+        case "AC":
+            return "Thr"
+        case "AA":
+            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
+                return "Asn"
+            else:
+                return "Lys"
+        case "AG":
+            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
+                return "Ser"
+            else:
+                return "Arg"
+        case "GU":
+            return "Val"
+        case "GC":
+            return "Ala"
+        case "GA":
+            if (rna_codon[2]=="U") | (rna_codon[2]=="C"):
+                return "Asp"
+            else:
+                return "Glu"
+        case "GG":
+            return "Gly"
+
+from textwrap import wrap
+def cut_codon(sequence):
+    return wrap(sequence,3)
+
+def traduction(sequence_arn): # translate rna
+    list_codons=cut_codon(sequence_arn)
+    prot=list()
+    for codon in list_codons:
+        if len(codon)==3:
+            prot.append(get_aa(codon))
+        else:
+            print("attempt to get the amino acid for an incomplete codon")
+    return prot
+
+def get_sequence_on_genome(feature,segments_on_target_genome): # returns the sequence of the feature on the target genome
+    list_seg=Features[feature].segments_list
+    first_seg=get_first_seg(list_seg,segments_on_target_genome)
+    last_seg=get_first_seg(reversed(list_seg),segments_on_target_genome)
+    path_on_target=get_feature_path(paths,first_seg,last_seg)
+
+    new_sequence=""
+    for segment in path_on_target:
+        if segment==cds.segments_list[0]:
+            new_sequence+=get_segment_sequence(seg_seq,segment)[cds.pos_start-2:]
+        elif segment==cds.segments_list[-1]:
+            new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop]
+        else:
+            new_sequence+=get_segment_sequence(seg_seq,segment)
+    return new_sequence
+
+def get_cds_variations(var_file): # creates dict : cds_id -> list of variations from the var file 
+    var=open(var_file,'r')
+    lines=var.readlines()
+    var.close()
+
+    # dict cds-var
+    cds_var={}
+    for line in lines:
+        line=line.split()
+        if line[1]=="CDS":
+            cds_id=line[0].replace('.','_').replace(':','_')
+            if cds_id not in cds_var.keys():
+                cds_var[cds_id]=list()
+            cds_var[cds_id].append(line)
+    return cds_var
+
+import re
+def findOtherStart(cds,segments_on_target_genome): # look for another start codon, before or after the original one 
+    print("\nlooking for a new start codon:")
+    frame_shift=0
+    # chercher codon start en amont du cds, dans le mrna
+    seq_parent=get_sequence_on_genome(cds.parent,segments_on_target_genome)
+    seq_cds=get_sequence_on_genome(cds_id,segments_on_target_genome)
+    sequence_amont=seq_parent[0:seq_parent.rfind(seq_cds)] # get the mrna sequence before the last occurence of the cds sequence
+    print("sequence before the CDS in the mRNA:",sequence_amont)
+    if "ATG" in sequence_amont:
+        start_pos_list=[m.start() for m in re.finditer('(?=ATG)', sequence_amont)]
+        stop_pos_list=[m.start() for m in re.finditer('(?=TAG|TAA|TGA)', sequence_amont)]
+        print("start codons position:",start_pos_list,"stop codons position:",stop_pos_list,"before the CDS") # positions (overlapping) où on trouve un atg.
+        # vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !)
+        print("checking if there is a stop codon after the start codons in the same reading frame:")
+        for start_pos in start_pos_list:
+            start_pos_frame=start_pos%3
+            n=len(sequence_amont)-start_pos+1
+            if True not in ( (stop_pos%3==start_pos_frame) & (stop_pos>start_pos) for stop_pos in stop_pos_list) :
+                #print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds")
+                # calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture !
+                frame_shift=(frame_shift+n)%3 # vérifier le frame shift !!
+                print("the start codon at the position",start_pos,",",n,"bases before the CDS, doesn't have a stop codon after in the same reading frame")
+            else:
+                print("the start codon at the position",start_pos,",",n,"bases before the CDS, has a stop codon after in the same reading frame")
+
+    # chercher codon start en aval, dans le cds
+    if "ATG" in seq_cds:
+        start_pos_list=[m.start() for m in re.finditer('(?=ATG)', seq_cds)]
+        print("start codon candidate found later in the CDS, at the base",start_pos_list[0]) # print seulement le premier
+    print("\n")
+    return frame_shift
+
+def print_variation_change(deleted_sequence,inserted_sequence): # print the consequence of the variation represented by the insertion and deletion
+    stop=False
+    deleted_aa=traduction(get_rna(deleted_sequence))
+    inserted_aa=traduction(get_rna(inserted_sequence))
+
+    if (len(deleted_aa)!=0) & (len(inserted_aa)!=0):
+        if deleted_aa!=inserted_aa:
+            print("consequence :",",".join(deleted_aa),"changes into",",".join(inserted_aa))
+        else:
+            print("consequence : synonymous variation in",",".join(deleted_aa))
+    elif len(inserted_aa)!=0:
+        print("consequence : insertion of",",".join(inserted_aa))
+    else:
+        print("consequence : deletion of",",".join(deleted_aa))
+    if ("*" in inserted_aa):
+        print("stop codon apparition in the cds of the target genome")
+        stop=True
+    return stop
+
+
+[paths,seg_seq]=get_segments_sequence_and_paths(gfa)
+segments_on_target_genome=get_segments_positions_on_genome(pos_seg)
+cds_var=get_cds_variations(var_file)
+for feature in Features.values(): # add the sequence of all features
+    add_feature_sequence(feature,seg_seq)
+
+
+# analysing the variations for all the cds :
+for cds_id in cds_var.keys():
+    cds=Features[cds_id]
+    print("analysis of the variations in the CDS",cds_id,":\n")
+    frame_shift=0
+    for index, var in enumerate(cds_var[cds_id]): # for each variation in the current cds :
+        type_var=var[8]
+        if type_var!="no_var": # if there is a variation
+            posVar=[int(var[12]),int(var[13])]
+            sequence_target=get_sequence_on_genome(cds_id,segments_on_target_genome)
+
+            if type_var=="insertion":
+                length_ref=0
+            else:
+                length_ref=len(var[9])
+            if type_var=="deletion":
+                length_alt=0
+            else:
+                length_alt=len(var[10])
+            
+            print("variation",index, ":")
+            
+            if posVar[0]<=3:
+                print("variation of the start codon, mRNA most likely wont be translated")
+                #findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon
+                break
+
+            if abs(length_alt-length_ref)%3 == 0: # size diff 3k -> no frame shift.
+
+                if (posVar[0])%3==0: #size diff 3k, position 3k
+                    print("variation between two codons not causing a frameshift")
+                    if type_var=="insertion":
+                        print("insertion of",var[10])
+                    elif type_var=="deletion":
+                        print("deletion of",var[9])
+                    else:
+                        print("substitution of",var[9],"by",var[10])
+
+                    len_fragment_after=(3-length_ref)%3
+                    deleted_sequence=cds.sequence[posVar[0]:posVar[0]+length_ref+len_fragment_after]
+                    inserted_sequence=sequence_target[posVar[1]:posVar[1]+length_alt+len_fragment_after]
+                    stop=print_variation_change(deleted_sequence,inserted_sequence)
+                    if stop:
+                        break
+
+                else: # size diff 3k, position !=3k
+                    print("variation in the middle of a codon not causing a frameshift")
+                    if type_var=="insertion":
+                        print("insertion of",var[10])
+                    elif type_var=="deletion":
+                        print("deletion of",var[9])
+                    else:
+                        print("substitution of",var[9],"by",var[10])
+
+                    len_fragment_before=(posVar[0])%3
+                    len_fragment_after=(3-(len_fragment_before+length_ref))%3
+
+                    total_ins=sequence_target[posVar[1]-len_fragment_before:posVar[1]+length_alt+len_fragment_after]
+                    total_del=cds.sequence[posVar[0]-len_fragment_before:posVar[0]+length_ref+len_fragment_after]
+
+                    stop=print_variation_change(total_del,total_ins)
+                    if stop:
+                        break
+                # possible that it prints too many variations : for ex if we have a snp on the first and the last base of a codon, 
+                # while printing the effect of the first snp we aso use the second one.
+
+            else: # size diff !=3k
+                print("frameshift variation")
+                old_frameshift=frame_shift
+                frame_shift=(frame_shift+length_ref-length_alt)%3
+                # frameshift=0 -> reading frame recovered. may need to get a base before.
+                # frameshift=1 -> frame shift of 1 base to the right
+                # frameshift=2 -> frame shift of 2 bases to the right
+
+                if type_var=="insertion":
+                    print("insertion of",var[10])
+                elif type_var=="deletion":
+                    print("deletion of",var[9])
+                else:
+                    print("substitution of",var[9],"by",var[10])
+
+                len_fragment_before_del=(posVar[0])%3
+                len_fragment_before_ins=(posVar[1])%3
+
+                if frame_shift==0:
+                    # print only the local change.
+                    len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3
+                    len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3
+                    total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins]
+                    total_del=cds.sequence[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del]
+                    stop=print_variation_change(total_del,total_ins)
+                    print("recovery of the original reading frame")
+                    if stop:
+                        break
+
+                else:
+                    # print changes from local var to next var. at the next var, we will see if the reading frame is recovered.
+                    print("creating frame shift of",frame_shift,"base(s) to the right.")
+                    if old_frameshift==0:
+                        print("loss of the original reading frame")
+                    if index==len(cds_var[cds_id])-1: # it is the last variation. translate until the end of the cds.
+                        total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:]
+                        total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:]
+                        stop=print_variation_change(total_total_del,total_total_ins)
+                        if stop:
+                            break
+                    else: 
+                        nextVar=cds_var[cds_id][index+1]
+                        posNextVar=[int(nextVar[12]),int(nextVar[13])]
+
+                        if nextVar[8]=="insertion":
+                            length_ref_nextvar=0
+                        else:
+                            length_ref_nextvar:len(nextVar[9])
+                        if nextVar[8]=="deletion":
+                            length_alt_nextvar=0
+                        else:
+                            length_alt_nextvar=len(nextVar[10])
+
+                        len_fragment_before_del_nextvar=(posNextVar[0])%3
+                        len_fragment_before_ins_nextvar=(posNextVar[1])%3
+                        total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar]
+                        total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar]
+                        stop=print_variation_change(total_total_del,total_total_ins)
+                        if stop:
+                            break
+
+
+
+        print("\n")
\ No newline at end of file
--- a/main.py
+++ b/main.py
+#!/home/nina/annotpangenome/.venv/bin/python
+
+# created by Nina Marthe 2023 - nina.marthe@ird.fr
+# licensed under MIT
+
+from Graph_gff import *
+from Functions_output import *
+#from inference import *
+
+run="command_line"
+
+if run=="command_line":
+
+    import sys
+    if not(len(sys.argv)>=4) :# intersect, gfa, pos_seg (target_genome_name)
+            print("expected input : intersect, gfa file  with walks, bed file with positions of the segments on the target genome")
+            #print("output : graph gff, graph gaf, target genome gff*2+variations")
+            sys.exit(1)
+    elif (sys.argv[1]=="-h") :
+        print("expected input : intersect, gfa file  with walks, bed file with positions of the segments on the target genome")
+        print("output : graph gff, graph gaf, target genome gff*2+variations")
+        sys.exit(1)
+
+    intersect=sys.argv[1]
+    gfa=sys.argv[2]
+    pos_seg=sys.argv[3]
+
+    out_gff=gfa.split("/")[-1].split(".")[0:-1][0]+".gff"
+    out_gaf=gfa.split("/")[-1].split(".")[0:-1][0]+".gaf"
+    out_once=pos_seg.split("/")[-1].split(".")[0:-1][0]+".gff"
+    out_detail=pos_seg.split("/")[-1].split(".")[0:-1][0]+"_detail.gff"
+    out_var=pos_seg.split("/")[-1].split(".")[0:-1][0]+"_variations.txt"
+    if len(sys.argv)==5:
+        target_genome_name=sys.argv[5]
+    else:
+        target_genome_name=pos_seg.split("/")[-1].split(".")[0:-1][0]
+
+    load_intersect(intersect)
+
+    # outputs the gff and gaf of the graph for chr10
+    graph_gff(out_gff)
+    graph_gaf(out_gaf,gfa)
+
+    # outputs the gff of a genome for the chr10
+    genome_gff(pos_seg,out_gff,gfa,out_once,out_detail,out_var,target_genome_name)
+
+
+
+
+if run=="test":
+    intersect_path='intersect.bed'
+    load_intersect(intersect_path)
+
+    # outputs the gff of the graph for chr10
+    output_gff='graph.gff'
+    output_gaf='graph.gaf'
+    gfa="graph.gfa"
+    graph_gff(output_gff)
+    graph_gaf(output_gaf,gfa)
+
+    out_once="target_genome.gff"
+    out_var="variations.txt"
+    out_detail="target_genome_detail.gff"
+    pos_seg="genome2_chr10.bed"
+
+    gff=output_gff
+
+    target_genome_name="genome2_chr10"
+
+    # outputs the gff of a genome for the chr10
+    genome_gff(pos_seg,gff,gfa,out_once,out_detail,out_var,target_genome_name)
+
+if run=="reel":
+
+    # creates segments and features for the intersect between the graph for chr10 and the gff of IRGSP
+    intersect_path='/home/nina/annotpangenome/align_genes/intersect_segments-genes_chr10.bed'
+    load_intersect(intersect_path)
+
+    # outputs the gff of the graph for chr10
+    output_gff='graph_chr10.gff'
+    output_gaf='graph_chr10.gaf'
+    gfa="data/graphs/RiceGraphChr10_cactus.gfa"
+    graph_gff(output_gff)
+    graph_gaf(output_gaf,gfa)
+
+
+    genome = 'ac'
+    if genome=='ac': # transfer from graph to azucena
+        pos_seg="seg_coord/AzucenaRS1_chromosome10.bed"
+        out_once="azucena_chr10.gff"
+        out_detail="azucena_detail_chr10.gff"
+        out_var="variations_chr10.gff"
+
+    if genome=='nb': # transfer from graph to nipponbare
+        pos_seg="seg_coord/IRGSP-1_0_Chr10.bed"
+        out_once="nb_chr10_all.gff"
+        out_detail="nb_chr10_all_detail.gff"
+        out_var="variations_chr10.gff"
+
+    gff=output_gff
+    target_genome_name="CM020642.1_Azucena_chromosome10"
+
+    # outputs the gff of a genome for the chr10
+    genome_gff(pos_seg,gff,gfa,out_once,out_detail,out_var,target_genome_name)
+
+