# Functions.py

from Graph_gff import Segments, Features, get_feature_start_on_segment, get_feature_stop_on_segment,invert_seg,search_segment
# Registry of the segments placed on the target genome, keyed by segment id.
# Each value is [chrom, start, stop, strand, index, file_name]; it is filled by
# get_segments_positions_on_genome.
segments_on_target_genome = {}


# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinates of the segments on the genome
def get_feature_start_on_target_genome(start_seg,feat_id):
    """Return the start position of a feature on the target genome.

    The result is the start of `start_seg` on the target genome (field 1 of
    its `segments_on_target_genome` entry) plus the start of the feature on
    that segment, minus one.
    """
    return segments_on_target_genome[start_seg][1] + get_feature_start_on_segment(start_seg, feat_id) - 1

# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinates of the segments on the genome
def get_feature_stop_on_target_genome(stop_seg,feat_id):
    """Return the stop position of a feature on the target genome.

    The result is the start of `stop_seg` on the target genome (field 1 of
    its `segments_on_target_genome` entry) plus the stop of the feature on
    that segment, minus one.
    """
    return segments_on_target_genome[stop_seg][1] + get_feature_stop_on_segment(stop_seg, feat_id) - 1


# get the start position of the features on the linear genome for inverted features
def get_feature_start_on_target_genome_inv(start_seg,feat_id):
    """Return the start position of an inverted feature on the target genome.

    The offset of the feature start on `start_seg` is counted backwards from
    the end of the segment on the target genome (field 2 of its
    `segments_on_target_genome` entry).
    """
    return segments_on_target_genome[start_seg][2] - get_feature_start_on_segment(start_seg, feat_id) + 1

# get the stop position of the features on the linear genome for inverted features
def get_feature_stop_on_target_genome_inv(stop_seg,feat_id):
    """Return the stop position of an inverted feature on the target genome.

    The offset of the feature stop on `stop_seg` is counted backwards from
    the end of the segment on the target genome (field 2 of its
    `segments_on_target_genome` entry).
    """
    return segments_on_target_genome[stop_seg][2] - get_feature_stop_on_segment(stop_seg, feat_id) + 1


# functions to get the gff with one line per feature
def right_size(size,max_diff,feat):
    """Tell whether `size` is within a factor `max_diff` of the size of feature `feat`.

    A `max_diff` of 0 disables the check (always True). Otherwise the size
    must be neither greater than `Features[feat].size * max_diff` nor smaller
    than `Features[feat].size / max_diff`.
    """
    if max_diff == 0:
        return True
    reference_size = Features[feat].size
    too_big = size > reference_size * max_diff
    too_small = size < reference_size / max_diff
    return not (too_big or too_small)

def create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion):
    """Build the GFF line describing a feature on the target genome.

    first_seg / last_seg: first and last segments of the feature found on the target genome.
    size_diff: value written in the `Size_diff` attribute.
    inversion: when true, start and stop are computed with the `_inv` helpers
        (start from `last_seg`, stop from `first_seg`) and the strand is inverted.

    Returns the tab-separated line, terminated by a newline.
    """
    seq_name = segments_on_target_genome[first_seg][0]
    feature = Features[feature_id]
    strand = feature.strand

    nb_variants = count_variations(feature_id)
    annotation = f'{feature.annot};Size_diff={size_diff};Nb_variants={nb_variants}'

    if not inversion:
        start = get_feature_start_on_target_genome(first_seg, feature_id)
        stop = get_feature_stop_on_target_genome(last_seg, feature_id)
    else:
        start = get_feature_start_on_target_genome_inv(last_seg, feature_id)
        stop = get_feature_stop_on_target_genome_inv(first_seg, feature_id)
        strand = invert_strand(strand)

    return f'{seq_name}\tGrAnnoT\t{feature.type}\t{start}\t{stop}\t.\t{strand}\t.\t{annotation}\n'


# functions to get the alignment for the transfered genes

# create alignment for a feature
def segment_aln(type,seg_seq,seg_a,seg_b,first,feature_id,last):

    match type:
        case "identity":
            if first:
                feature=Features[feature_id]
                seq_aln=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]
            elif last:
                feature=Features[feature_id]
                seq_aln=get_segment_sequence(seg_seq,seg_a)[:feature.pos_stop]
            else:
                seq_aln=get_segment_sequence(seg_seq,seg_a)
            line_a=seq_aln
            line_b=seq_aln
            len_aln=len(seq_aln)
            line_c=len_aln*"*"
        case "substitution":
            seq_aln_a=get_segment_sequence(seg_seq,seg_a)
            seq_aln_b=get_segment_sequence(seg_seq,seg_b)
            len_a=len(seq_aln_a)
            len_b=len(seq_aln_b)
            if len_a>len_b:
                diff_len=len_a-len_b
                line_a=seq_aln_a
                line_b=seq_aln_b+diff_len*"-"
                line_c=len_a*" "
            else:
                diff_len=len_b-len_a
                line_a=seq_aln_a+diff_len*"-"
                line_b=seq_aln_b
                line_c=len_b*" "
        case "insertion":
            seq_aln_b=get_segment_sequence(seg_seq,seg_b)
            len_b=len(seq_aln_b)
            line_a=len_b*"-"
            line_b=seq_aln_b
            line_c=len_b*" "
        case "deletion":
            if first:
                feature=Features[feature_id]
                seq_aln_a=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]
            else:
                seq_aln_a=get_segment_sequence(seg_seq,seg_a)
            len_a=len(seq_aln_a)
            line_a=seq_aln_a
            line_b=len_a*"-"
            line_c=len_a*" "
        case "end_deletion":
            seq_aln_a=""
            for segment in seg_a[:-1]:
                seq_aln_a+=get_segment_sequence(seg_seq,segment)
            feature=Features[feature_id]
            seq_aln_a+=get_segment_sequence(seg_seq,seg_a[-1])[0:feature.pos_stop] # for the last segment, only take the part that the feature is on
            len_a=len(seq_aln_a)
            line_a=seq_aln_a
            line_b=len_a*"-"
            line_c=len_a*" "

    return [line_a,line_b,line_c,False] # check the orientation of the segment later


def parse_aln_lines(line_a,line_b,line_c,feature_id):
    """Format three alignment rows as text blocks of 60 columns.

    line_a / line_b: source and target rows, gaps written as "-".
    line_c: marker row.
    Each block holds the source row, the target row (both followed by the
    cumulative number of non-gap characters written so far) and the marker
    row. A message is printed if the three rows do not have the same length.

    Returns the formatted text, ending with an empty line.
    """
    if not (len(line_a) == len(line_b) == len(line_c)):
        print("line lengths differ in alignment")

    width = 60
    source_header = feature_id + "_source    "
    target_header = feature_id + "_target    "
    blank_header = (len(feature_id) + 11) * " "

    blocks = []
    residues_a = 0
    residues_b = 0
    for offset in range(0, len(line_a), width):
        part_a = line_a[offset:offset + width]
        part_b = line_b[offset:offset + width]
        part_c = line_c[offset:offset + width]
        residues_a += len(part_a) - part_a.count("-")
        residues_b += len(part_b) - part_b.count("-")
        blocks.append(f'{source_header}{part_a}    {residues_a}\n')
        blocks.append(f'{target_header}{part_b}    {residues_b}\n')
        blocks.append(f'{blank_header}{part_c}\n\n')
    blocks.append("\n")

    return "".join(blocks)


def create_line_aln(feature_path_source_genome,feature_path_target_genome,seg_seq,feature_id):
    """Align the source and target paths of a feature and return the formatted alignment.

    Both paths are walked in parallel. At each step the pair of current
    segments is classified as identity, substitution, insertion or deletion
    and the rows returned by segment_aln are appended. Source segments left
    once the target path is exhausted are reported as an end deletion.

    Returns the text produced by parse_aln_lines.
    """
    source_path = feature_path_source_genome
    target_path = feature_path_target_genome
    nb_source = len(source_path)
    nb_target = len(target_path)

    row_a = row_b = row_c = ""
    i = j = 0
    first = True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on
    last = False # same for the last part of the feature

    while i < nb_source and j < nb_target:
        if i == nb_source - 1:
            last = True
        seg_source = source_path[i]
        seg_target = target_path[j]

        if seg_source == seg_target: # segment present in both, no variation
            event, seg_a, seg_b, step_i, step_j = "identity", seg_source, seg_target, 1, 1
        elif seg_target not in source_path: # the target segment is absent from the source path
            if seg_source not in target_path: # the source segment is absent from the target path too : substitution
                event, seg_a, seg_b, step_i, step_j = "substitution", seg_source, seg_target, 1, 1
            else: # only the target segment is new : insertion
                event, seg_a, seg_b, step_i, step_j = "insertion", "", seg_target, 0, 1
        elif seg_source not in target_path: # only the source segment is missing : deletion
            event, seg_a, seg_b, step_i, step_j = "deletion", seg_source, "", 1, 0
        else: # both segments exist in the other path, but not at the same position
            event, seg_a, seg_b, step_i, step_j = "substitution", seg_source, seg_target, 1, 1

        add_a, add_b, add_c, first = segment_aln(event, seg_seq, seg_a, seg_b, first, feature_id, last)
        row_a += add_a
        row_b += add_b
        row_c += add_c
        i += step_i
        j += step_j

    if i < nb_source: # the end of the source path was not reached : it is missing on the target genome
        add_a, add_b, add_c, first = segment_aln("end_deletion", seg_seq, source_path[i:], "", first, feature_id, last)
        row_a += add_a
        row_b += add_b
        row_c += add_c

    return parse_aln_lines(row_a, row_b, row_c, feature_id)


# functions to output the stats on the transfer

def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feature_id):
    """Record in which category of missing segments a feature falls.

    feature_missing_segments is a list of seven lists of feature ids:
    [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
    first_seg / last_seg: first and last segments of the feature found on the
        target genome ('' for first_seg when none was found).
    list_seg: segments of the feature on the source genome.

    The lists are modified in place; nothing is returned.
    """
    feature_missing_segments[5].append(feature_id)

    if first_seg=='': # no segment of the feature is in the genome, the feature is missing entirely
        feature_missing_segments[3].append(feature_id)
    elif first_seg!=list_seg[0]: # the first segment is missing
        feature_missing_segments[0].append(feature_id)
    elif last_seg!=list_seg[-1]: # the last segment is missing
        feature_missing_segments[2].append(feature_id)
    elif feature_id not in feature_missing_segments[3]:
        # check the segments strictly between the first and the last one.
        # list_seg[1:-1] is empty for features with one or two segments.
        if any(segment not in segments_on_target_genome for segment in list_seg[1:-1]):
            feature_missing_segments[1].append(feature_id)

    # go through the segments, to see if one is missing anywhere on the feature
    if feature_id not in feature_missing_segments[4]:
        if any(segment not in segments_on_target_genome for segment in list_seg):
            feature_missing_segments[4].append(feature_id)

    # if the feature doesnt have a missing segment, it is complete.         ADD THE PATH CHECK FOR INSERTIONS !!
    if feature_id not in feature_missing_segments[4]:
        feature_missing_segments[6].append(feature_id)

def get_annot_features(list_features):
    """Return the `note` of every feature id of `list_features`, in the same order."""
    return [Features[feature_id].note for feature_id in list_features]

def count_hypput_total(list_annot_first):
    """Count the annotations containing "hypothetical" or "putative".

    Returns [count, total], where total is the number of annotations given.
    """
    total = len(list_annot_first)
    flagged = sum(
        1 for annot in list_annot_first
        if "hypothetical" in annot or "putative" in annot
    )
    return [flagged, total]

# print stats on the transfer : number of feature that have segments in different positions missing. 
def _hypput_percentage(hyp_put,total):
    """Return the percentage of hyp_put among total, rounded to 2 decimals (0.0 when total is 0)."""
    if total==0:
        return 0.0
    return round(100*(hyp_put)/total,2)

def stats_features(feature_missing_segments):
    """Print, for each category of features, its size and its share of hypothetical or putative annotations.

    feature_missing_segments is a list of seven lists of feature ids:
    [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
    An empty category is reported with a share of 0.0 % instead of raising a ZeroDivisionError.
    """
    [hyp_put,total]=count_hypput_total(get_annot_features(feature_missing_segments[0]))
    print("\nthe first segment is missing for", total,"features, including",_hypput_percentage(hyp_put,total),"% hypothetical or putative.")

    [hyp_put,total]=count_hypput_total(get_annot_features(feature_missing_segments[1]))
    print("a middle segment is missing for", total,"features, including",_hypput_percentage(hyp_put,total),"% hypothetical or putative.")

    [hyp_put,total]=count_hypput_total(get_annot_features(feature_missing_segments[2]))
    print("the last segment is missing for", total,"features, including",_hypput_percentage(hyp_put,total),"% hypothetical or putative.")

    [hyp_put,total]=count_hypput_total(get_annot_features(feature_missing_segments[3]))
    print(total,"features are entirely missing, including",_hypput_percentage(hyp_put,total),"% hypothetical or putative.")

    [hyp_put,total]=count_hypput_total(get_annot_features(feature_missing_segments[4]))
    print("there is at least one segment missing for", total,"features, including",_hypput_percentage(hyp_put,total),"% hypothetical or putative.")

    [hyp_put,total]=count_hypput_total(get_annot_features(feature_missing_segments[6]))
    print(total ,"features are entirely present in the new genome, including",_hypput_percentage(hyp_put,total),"% hypothetical or putative.")

    [hyp_put,total]=count_hypput_total(get_annot_features(feature_missing_segments[5]))
    print("there is", total,"features in total, including",_hypput_percentage(hyp_put,total),"% hypothetical or putative.")


# functions to generate the different gffs

def get_segments_positions_on_genome(pos_seg):
    """Read a BED file of segment positions and register them in `segments_on_target_genome`.

    pos_seg: path of the BED file. Columns used: sequence name, start, stop, segment id.
    Each segment id is mapped to [chrom, start, stop, strand, index, file_name] where
    start is converted to 1-based, strand is the first character of the segment id,
    index is the rank of the line among the non-empty lines of the file (from 0)
    and file_name is the name of the file without directory and extension.
    A segment id seen several times keeps its last entry. Empty lines are skipped.
    """
    # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename
    file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1])
    seg_count=0
    with open(pos_seg,'r') as bed:
        for line in bed:
            fields=line.split()
            if not fields:
                continue
            seg=fields[3]
            chrom=fields[0]
            start=int(fields[1])+1 # +1 to convert the bed 0-based coordinate to a 1-based system
            stop=int(fields[2])
            strand=seg[0:1]
            segments_on_target_genome[seg]=[chrom,start,stop,strand,seg_count,file_name]
            seg_count+=1

# look for the segment on either strand of the target genome
def search_seg_on_target_genome(segment):
    """Return the id under which `segment` is registered on the target genome.

    The segment itself is tried first, then its inverted form (invert_seg).
    Returns False when neither is in `segments_on_target_genome`.
    """
    flipped = invert_seg(segment)
    for candidate in (segment, flipped):
        if candidate in segments_on_target_genome:
            return candidate
    return False

def get_segments_sequence(segments_file,segments_list):
    """Read the sequences of the requested segments from a segments file.

    segments_file: path of a whitespace-separated file whose second column is
        the segment number and whose third column is its sequence.
    segments_list: collection of segment ids ('s' followed by the segment number) to keep.

    Returns a dict mapping each requested segment id found in the file to its
    sequence. Lines with fewer than three columns (e.g. empty lines) are skipped.
    """
    # membership is tested once per line of the file: use a hashed collection
    if isinstance(segments_list,(set,frozenset,dict)):
        wanted=segments_list
    else:
        wanted=set(segments_list)

    seg_seq={}
    with open(segments_file,'r') as file_segments:
        for line in file_segments:
            fields=line.split()
            if len(fields)<3:
                continue
            seg_id='s'+fields[1]
            if seg_id in wanted:
                seg_seq[seg_id]=fields[2]
    return seg_seq

def get_paths(walks_file,target_genome):
    """Read the walks of the target genome from a walks file.

    walks_file: path of a whitespace-separated file. The walk name is built
        from columns 2 and 4 joined by '_'; column 7 holds the comma-separated
        oriented segments, whose first item is ignored.
    target_genome: only the walks whose name contains this string are kept.

    Returns a dict mapping each walk name to its list of segments, written as
    '>s<number>' or '<s<number>'. Items that do not start with '>' or '<' are
    dropped. Empty lines are skipped.
    """
    paths={}
    with open(walks_file,'r') as file_walks:
        for line in file_walks:
            fields=line.split()
            if not fields:
                continue
            seq_name=fields[1]+"_"+fields[3]
            if target_genome not in seq_name: # not a walk of the target genome
                continue
            walk=fields[6].split(',')[1:]
            paths[seq_name]=[step[0]+'s'+step[1:] for step in walk if step[0:1] in ('>','<')]
    return paths

def get_first_seg(list_seg):
    """Return the first segment of `list_seg` found on the target genome.

    The value returned is the one given by search_seg_on_target_genome, or ''
    when no segment of the list is found.
    """
    for segment in list_seg:
        hit = search_seg_on_target_genome(segment)
        if hit:
            return hit
    return ''


# functions to get the detail of the variations in the features

# find on what target path the segments of the feature are (ie what chromosome/contig)
def find_feature_target_path(first_seg,last_seg,target_genome_paths):
    """Find the target path carrying the first and last segments of a feature.

    The file names recorded for both segments in `segments_on_target_genome`
    (field 5) are compared.

    Returns "frag" when they differ, the first name of `target_genome_paths`
    contained in that file name when they match, "not_found" otherwise.
    """
    first_on_target = search_seg_on_target_genome(first_seg)
    last_on_target = search_seg_on_target_genome(last_seg)
    file_first = segments_on_target_genome[first_on_target][5]
    file_last = segments_on_target_genome[last_on_target][5]

    if file_first != file_last:
        return "frag"
    for path in target_genome_paths:
        if path in file_first:
            return path
    return "not_found"

# add the path of the feature on the target genome in the object Feature
def add_target_genome_path(feature_id,target_genome_paths):
    """Store in `segments_list_target` the path of a feature on the target genome.

    The path is left empty when no segment of the feature is found on the
    target genome, when the feature is fragmented (a message is printed) or
    when no target path is found.
    """
    feature = Features[feature_id]
    source_segments = feature.segments_list_source
    first_seg = get_first_seg(source_segments)
    last_seg = get_first_seg(reversed(source_segments))

    target_segments = []
    if first_seg != '':
        path = find_feature_target_path(first_seg, last_seg, target_genome_paths)
        if path == "frag":
            print(f'feature {feature_id} fragmented')
        elif path != "not_found":
            target_segments = get_feature_path(target_genome_paths[path], first_seg, last_seg)
    feature.segments_list_target = target_segments

# find the feature's path in target genome
def get_feature_path(target_genome_path,first_seg,last_seg):
    """Return the part of `target_genome_path` spanning two segments, both included.

    The bounds are the indexes recorded for the two segments in
    `segments_on_target_genome` (field 4), taken in increasing order.
    """
    first_on_target = search_seg_on_target_genome(first_seg)
    last_on_target = search_seg_on_target_genome(last_seg)
    index_first = segments_on_target_genome[first_on_target][4]
    index_last = segments_on_target_genome[last_on_target][4]
    low, high = sorted((index_first, index_last))
    return target_genome_path[low:high + 1]

def count_variations(feature_id):
    """Count the segments that differ between the source and target paths of a feature.

    The count is the number of distinct segments present in only one of the
    two paths; the target path is inverted first when the feature is detected
    as inverted. A substitution is therefore counted twice, as insertion+deletion.

    Returns 0 when the feature has no target path.
    NOTE(review): 0 is an assumption for that case, which used to raise an
    UnboundLocalError — confirm the expected value.
    """
    feature=Features[feature_id]
    target_list=feature.segments_list_target
    if len(target_list)==0:
        return 0

    source_list=feature.segments_list_source
    if detect_feature_inversion(source_list,target_list):
        target_list=invert_segment_list(target_list)
    # segments found in exactly one of the two paths
    return len(set(source_list)^set(target_list))

def invert_strand(strand):
    """Return the opposite strand symbol: "+" <-> "-" and ">" <-> "<".

    Any other value gives an empty string.
    """
    if strand == "+":
        return "-"
    if strand == "-":
        return "+"
    if strand == ">":
        return "<"
    if strand == "<":
        return ">"
    return ""

def get_sequence_list_seg(list_seg,i,feature,seg_seq):
    """Return the concatenated sequence of the segments of `list_seg` from index `i`.

    The sequence of the last segment of the list is cut at `feature.pos_stop`.
    """
    last_index = len(list_seg) - 1
    pieces = []
    for k in range(i, len(list_seg)):
        sequence = get_segment_sequence(seg_seq, list_seg[k])
        pieces.append(sequence[0:feature.pos_stop] if k == last_index else sequence)
    return "".join(pieces)

def get_segment_sequence(seg_seq,segment):
    """Return the sequence of an oriented segment.

    segment: orientation character followed by the segment id used as key in `seg_seq`.
    The stored sequence is returned as is when the orientation is ">",
    reverse complemented otherwise.
    """
    is_forward = segment[0] == ">"
    sequence = seg_seq[segment[1:]]
    return sequence if is_forward else reverse_complement(sequence)

# complement of each nucleotide; lowercase (soft-masked) bases keep their case
_COMPLEMENT={"A":"T","C":"G","G":"C","T":"A","a":"t","c":"g","g":"c","t":"a"}
_COMPLEMENT_TABLE=str.maketrans(_COMPLEMENT)

def reverse_complement(sequence):
    """Return the reverse complement of a nucleotide sequence.

    A, C, G and T are complemented in upper and lower case; any other
    character is kept unchanged.
    """
    # one pass for the complement, one for the reversal
    return sequence.translate(_COMPLEMENT_TABLE)[::-1]

def complement(nucl):
    """Return the complement of a single nucleotide.

    A, C, G and T are complemented in upper and lower case; any other value
    is returned unchanged.
    """
    return _COMPLEMENT.get(nucl,nucl)

class Variation:
    """A variation found on a feature, with the information on that feature on the target genome."""

    def __init__(self,feature_id,feature_type,chr,start_new,stop_new,inversion,size_diff,size_new):
        # information on the feature the variation is on
        self.feature_id=feature_id
        self.feature_type=feature_type
        self.chr=chr
        self.start_new=start_new
        self.stop_new=stop_new
        self.inversion=inversion
        self.size_diff=size_diff
        self.size_new=size_new
        # information on the variation itself
        self.type=''
        self.last_seg_in_target=''
        self.seg_ref=[]
        self.seg_alt=[]
        # attributes assigned by reset_var and init_new_var, given the same
        # defaults here so that they can be read on a fresh object
        self.size_var=0
        self.start_var=''
        self.start_var_index=0
        self.ref=''
        self.alt=''
        self.start_on_target=''

    # add fct to write line.

# initiate a Variation object with the information on the feature it is on
def create_var(feature_id,first_seg,last_seg):
    """Create the Variation object of a feature from its position on the target genome.

    first_seg / last_seg: first and last segments of the feature on the target genome.
    When the feature is detected as inverted, the target path is inverted and
    the positions are computed with the `_inv` helpers.

    Returns (variation, source path, target path).
    """
    feature = Features[feature_id]
    # feature paths on the target genome and on the original genome
    target_path = feature.segments_list_target
    source_path = feature.segments_list_source
    inverted = detect_feature_inversion(source_path, target_path)

    if not inverted:
        start = get_feature_start_on_target_genome(first_seg, feature_id)
        stop = get_feature_stop_on_target_genome(last_seg, feature_id)
        new_size = stop - start + 1
    else:
        target_path = invert_segment_list(target_path)
        start = get_feature_start_on_target_genome_inv(last_seg, feature_id)
        stop = get_feature_stop_on_target_genome_inv(first_seg, feature_id)
        new_size = start - stop + 1

    size_diff = str(new_size - feature.size)
    seq_name = segments_on_target_genome[first_seg][0]

    variation = Variation(feature_id, feature.type, seq_name, start, stop, inverted, size_diff, new_size)
    return (variation, source_path, target_path)

# reset the informations of the variation, but keep the information about the feature
def reset_var(variation):
    """Reset the fields describing the current variation; the other attributes are left as they are.

    NOTE(review): seg_ref, seg_alt and last_seg_in_target are not reset here —
    confirm that callers do not expect it.
    """
    defaults = (
        ("type", ''), # make type enumerate
        ("size_var", 0),
        ("start_var", ''),
        ("start_var_index", 0),
        ("ref", ''),
        ("alt", ''),
    )
    for attribute, value in defaults:
        setattr(variation, attribute, value)

def get_old_new_pos_substitution(feat_start,variation,feature_path_target_genome,feat):
    """Return [pos_old, pos_new], the positions of a substitution relative to the feature start.

    pos_old is computed on the source genome from the start of the segment
    `variation.start_var`; pos_new on the target genome from the segment
    `variation.start_on_target`. Both are returned as strings and are the base
    before the change.
    """
    source_segment = Segments[search_segment(variation.start_var)]
    pos_old = str(int(source_segment.start) - int(feat_start))

    feat_first_seg = feature_path_target_genome[0]
    var_seg = variation.start_on_target
    if not variation.inversion:
        var_start = segments_on_target_genome[var_seg][1]
        feat_start_target = get_feature_start_on_target_genome(feat_first_seg, feat)
        pos_new = str(var_start - feat_start_target)
    else:
        feat_first_seg = invert_seg(feat_first_seg)
        var_seg = invert_seg(var_seg)
        var_end = segments_on_target_genome[var_seg][2]
        feat_start_target = get_feature_start_on_target_genome_inv(feat_first_seg, feat)
        pos_new = str(feat_start_target - var_end)
    return [pos_old, pos_new] # pos_old and pos_new are the base before the change

def get_old_new_pos_insertion(variation,feat_start,feature_path_target_genome,feat):
    """Return [pos_old, pos_new], the positions of an insertion relative to the feature start.

    `variation.start_var` is the segment AFTER the insertion: the length of
    the inserted sequence (`variation.alt`) is used to step back to the
    insertion on the target genome. Both positions are returned as strings and
    are the base before the change.
    """
    source_segment = Segments[search_segment(variation.start_var)] # start_var is the segment AFTER the insertion
    pos_old = str(int(source_segment.start) - int(feat_start))

    feat_first_seg = feature_path_target_genome[0]
    next_seg = variation.start_var
    if not variation.inversion:
        var_start = segments_on_target_genome[next_seg][1] - len(variation.alt)
        feat_start_target = get_feature_start_on_target_genome(feat_first_seg, feat)
        pos_new = str(var_start - feat_start_target)
    else:
        feat_first_seg = invert_seg(feat_first_seg)
        next_seg = invert_seg(next_seg)
        var_end = segments_on_target_genome[next_seg][2] + len(variation.alt)
        feat_start_target = get_feature_start_on_target_genome_inv(feat_first_seg, feat)
        pos_new = str(feat_start_target - var_end)
    return [pos_old, pos_new] # pos_old and pos_new are the base before the change

def get_old_new_pos_deletion(variation,feat_start,feature_path_target_genome,feat):
    """Return [pos_old, pos_new], the positions of a deletion relative to the feature start.

    pos_old is an int. pos_new is the int 0 when no segment of the feature has
    been placed on the target genome yet (`variation.last_seg_in_target` is
    empty), a string otherwise.
    NOTE(review): the other get_old_new_pos_* functions return two strings —
    confirm that callers accept the mixed types.
    """
    seg_pos = search_segment(variation.start_var)
    pos_old = int(Segments[seg_pos].start) - int(feat_start)
    if variation.start_var_index == 0:
        # deletion on the first segment : shift by the start of the feature on that segment
        pos_old = pos_old + Features[feat].pos_start - 1
    elif pos_old < 0:
        pos_old = 0
        print("error with variation position",variation.inversion,"***")

    anchor_seg = variation.last_seg_in_target
    if anchor_seg == "": # deletion of the beginning of the feature, so no segment placed in the new genome yet
        return [pos_old, 0]

    feat_first_seg = feature_path_target_genome[0]
    if variation.inversion:
        feat_first_seg = invert_seg(feat_first_seg)
        anchor_seg = invert_seg(anchor_seg)
        var_start = segments_on_target_genome[anchor_seg][1] - 1
        feat_start_target = get_feature_start_on_target_genome_inv(feat_first_seg, feat)
        pos_new = str(feat_start_target - var_start)
    else:
        var_start = segments_on_target_genome[anchor_seg][2] + 1
        feat_start_target = get_feature_start_on_target_genome(feat_first_seg, feat)
        pos_new = str(var_start - feat_start_target)
    return [pos_old, pos_new] # pos_old and pos_new are the base before the change


def init_new_var(variation,type,feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature):
    variation.type=type
    variation.start_var=feature_path_source_genome[i]
    variation.start_var_index=i
    if type=="substitution":
        variation.start_on_target=feature_path_target_genome[j]
        variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
        variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
        variation.seg_ref.append(feature_path_source_genome[i])
        variation.seg_alt.append(feature_path_target_genome[j])
    elif type=="insertion":
        variation.ref="-"
        variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
        variation.seg_alt.append(feature_path_target_genome[j])
    elif type=="deletion":
        if i==0: # if the deletion is at the start of the feature, the deletion doesnt always start at the start at the first segment : 
            #use pos_start, position of the feature on its first segment
            variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])[feature.pos_start-1:]
            variation.seg_ref.append(feature_path_source_genome[i])
        else: # else, the deletion will always start at the start of the first segment.
            variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
            variation.seg_ref.append(feature_path_source_genome[i])
        variation.alt="-"

def continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,genome_to_continue):

    if variation.type=="substitution":
        if genome_to_continue==0: # genome_to_continue allows to choose if the substitution continues for the original or the target genome, or both.
            variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
            variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
            variation.seg_ref.append(feature_path_source_genome[i])
            variation.seg_alt.append(feature_path_target_genome[j])
        elif genome_to_continue==1: # deletion
            variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
            variation.seg_ref.append(feature_path_source_genome[i])
        elif genome_to_continue==2: # insertion
            variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
            variation.seg_alt.append(feature_path_target_genome[j])
    elif variation.type=="insertion":
        variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
        variation.seg_alt.append(feature_path_target_genome[j])
    elif variation.type=="deletion":
        variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
        variation.seg_ref.append(feature_path_source_genome[i])

def get_common_segments(list1,list2):
    """Return the elements of `list1` that are also in `list2`, in the order of `list1`."""
    return [elem for elem in list1 if elem in list2]

def compare_strand(list_1,list_2,list_1_unstrand,list_2_unstrand):
    """Count the shared segments that have the same strand in both lists.

    list_1 / list_2: stranded segments.
    list_1_unstrand / list_2_unstrand: the same segments without strand, in the same order.
    A shared segment is looked up at its first occurrence in each unstranded list.

    Returns [shared unstranded segments, number of them with the same strand].
    """
    shared = [segment for segment in list_1_unstrand if segment in list_2_unstrand]
    # the index in the unstranded list gives the matching stranded segment
    same_strand = sum(
        1 for segment in shared
        if list_1[list_1_unstrand.index(segment)] == list_2[list_2_unstrand.index(segment)]
    )
    return [shared, same_strand]

def detect_segment_order_inversion(list_1,list_2):
    """Tell whether the segments shared by the two lists are in reverse order.

    list_1 / list_2: lists of hashable segment ids.
    The shared segments of `list_1` are compared, position by position, with
    the shared segments of `list_2` read backwards. The comparison stops at
    the shorter of the two, so that lists sharing a different number of
    elements (repeated segments) do not raise an IndexError.

    Returns True if more than 90% of the shared segments of `list_1` match,
    False otherwise or when one of the lists has a single element.
    """
    if (len(list_1)==1) or (len(list_2)==1):
        return False
    members_1=set(list_1)
    members_2=set(list_2)
    list_1_common=[segment for segment in list_1 if segment in members_2]
    list_2_common=[segment for segment in list_2 if segment in members_1]
    cpt=sum(1 for seg_1,seg_2 in zip(list_1_common,reversed(list_2_common)) if seg_1==seg_2)
    return cpt>len(list_1_common)*0.9 # if more than 90% of the segments are on the same position when the lists are reversed, there is an inversion.
    
def detect_orient_inversion(list_1,list_2):
    """Tell whether the segments shared by two stranded lists are on opposite strands.

    The first character of each segment is removed to get the unstranded lists.
    The strand is considered inverted when fewer than 90% of the shared
    segments have the same strand.

    Returns [strand_inversion, list_1 unstranded, list_2 unstranded].
    """
    unstranded_1 = [segment[1:] for segment in list_1]
    unstranded_2 = [segment[1:] for segment in list_2]
    shared, same_strand = compare_strand(list_1, list_2, unstranded_1, unstranded_2)

    # if at least 90% of the shared segments have the same strand, no inversion
    strand_inversion = not (same_strand >= len(shared) * 0.9)
    return [strand_inversion, unstranded_1, unstranded_2]


# takes two lists of segments for two genes, check if the first list is an inversion of the second one (if the segments in common are on the opposite strand)
def detect_feature_inversion(list_1,list_2):
    """Tell whether the second list of segments is an inversion of the first one.

    An inversion requires both an inversion of the strand of the shared
    segments and an inversion of their order.
    """
    # inversion of the orientation of the segments
    strand_inversion, unstranded_1, unstranded_2 = detect_orient_inversion(list_1, list_2)
    # inversion of the order of the segments
    order_inversion = detect_segment_order_inversion(unstranded_1, unstranded_2)
    # with both inversions, the gene is in an inverted region
    return bool(order_inversion & strand_inversion)

def invert_segment_list(seg_list):
    """Return a new list with every segment inverted (invert_seg) and the order reversed."""
    flipped = [invert_seg(seg) for seg in seg_list]
    flipped.reverse()
    return flipped