Functions.py

import subprocess
from Graph_gff import Segments, Features, get_start, get_stop


# functions to generate a genome's gff from the graph's gff

# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_start_2(seg_start, seg_genome, feat_id):
    s_start=seg_genome[seg_start][1]
    f_start=get_start(seg_start,feat_id)
    
    start=int(s_start)+int(f_start)-1
    return start


# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_stop_2(seg_stop, seg_genome, feat_id):
    s_start=seg_genome[seg_stop][1]
    f_stop=get_stop(seg_stop,feat_id)

    stop=int(s_start)+int(f_stop)-1
    return stop


# get the position of a part of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,feat_start,feature,seg_genome):
    
    # start position of the entire feature on the reference
    start_gene_start=Segments[feat_start[1:]].start+get_start(feat_start[1:],feature.id)-1
    
    # start position of the part of the feature on the reference
    start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
    
    # start and stop position of the feature on the genome we transfer on, to get the length of the part of the feature
    start_new_genome=get_start_2(seg_start,seg_genome,feature.id) 
    stop_new_genome=get_stop_2(seg_stop,seg_genome,feature.id)
    
    # start position of the part of the feature on the complete feature
    start_gene=int(start_first_seg)-int(start_gene_start)+1
    # stop position : start+length
    stop_gene=start_gene+(stop_new_genome-start_new_genome)
    
    position=";position="+str(start_gene)+"-"+str(stop_gene)
    #position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
    return position


# get the proportion of a part of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_genome,feature):
    start_new_genome=get_start_2(seg_start,seg_genome,feature.id) # start position of the feature on the genome we transfer on
    stop_new_genome=get_stop_2(seg_stop,seg_genome,feature.id) # stop position of the feature on the genome we transfer on
    
    proportion=";proportion="+str(stop_new_genome-start_new_genome+1)+"/"+str(feature.size)
    #proportion=";number_bases="+str(stop_new_genome-start_new_genome+1)+";total_bases="+str(feature.size)
    return proportion


# returns the gff line to write in the output file (gff detail)
def write_line(list_seg,i,feat,seg_start,feat_start,seg_genome):
    seg_stop=list_seg[i-1]
    feature=Features[feat]
    chr=seg_genome[seg_start[1:]][0]
    strand=list_seg[i-1][0:1]
    
    # start and stop position of the feature on the genome we transfer on
    start_new_genome=get_start_2(seg_start[1:],seg_genome,feature.id) 
    stop_new_genome=get_stop_2(seg_stop[1:],seg_genome,feature.id)
    
    proportion=get_proportion(seg_start[1:],seg_stop[1:],seg_genome,feature)
    position=get_position(seg_start[1:],seg_stop[1:],feat_start,feature,seg_genome)
    
    annotation=feature.annot+proportion+position
    
    out_line=chr+"\tGrAnnot\t"+feature.type+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
    return out_line


# takes a list of stranded segments, returns a list of unstranded segments
def remove_strand(list_seg):
    list_unstrand=list()
    for seg_stranded in list_seg:
        list_unstrand.append(seg_stranded[1:])
    return list_unstrand


# takes two lists of segments for two genes, check if the first list is an inversion of the second one (if most of the segments in common are on the opposite strand)
def detect_inversion(list_1,list_2):
    list_1_unstrand=remove_strand(list_1)
    list_2_unstrand=remove_strand(list_2)

    # get the list of segments in common
    seg_common=list()
    for seg in list_1_unstrand:
        if seg in list_2_unstrand:
            seg_common.append(seg)
    
    # for each segment in common, check if the strand is the same. check index in list unstranded to get the segment in list stranded
    same_strand_count=0
    for seg in seg_common:
        index_1=list_1_unstrand.index(seg)
        index_2=list_2_unstrand.index(seg)
        if list_1[index_1]==list_2[index_2]:
            same_strand_count+=1

    # if there is less than 10% of strand difference among the common segments (ie more than 90% of same strand), return False, no inversion 
    if len(seg_common)-same_strand_count<len(seg_common)/10:
        return False
    else:
        return True


# takes a feature and a feature type, returns a list of child features that have the wanted type
def get_list_childs(feature,child_type):
    list_childs=list()
    for child in feature.childs:
        if Features[child].type==child_type:
            list_childs.append(child)
    return list_childs


# print stats on the transfer : number of feature that have segments in different positions missing. 
def stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok):

    # creates files with the features by category (missing first, middle, end segment, etc)
    feat_miss_first=open('features_missing_first.txt','w')
    feat_miss_first.write("features missing the first segment on azucena :\n")
    for first in feature_missing_first :
        feat_miss_first.write(Features[first].annot)
        if Features[first].type!='gene':
            feat_miss_first.write(';')
            feat_miss_first.write(Features[first].note)
        feat_miss_first.write("\n")
    feat_miss_first.close()
        
    feat_miss_middle=open('features_missing_middle.txt','w')
    feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
    for middle in feature_missing_middle :
        feat_miss_middle.write(Features[middle].annot)
        if Features[middle].type!='gene':
            feat_miss_middle.write(';')
            feat_miss_middle.write(Features[middle].note)
        feat_miss_middle.write("\n")
    feat_miss_middle.close()
       
    feat_miss_last=open('features_missing_last.txt','w')
    feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
    for last in feature_missing_last :
        feat_miss_last.write(Features[last].annot)
        if Features[last].type!='gene':
            feat_miss_last.write(';')
            feat_miss_last.write(Features[last].note)
        feat_miss_last.write("\n")
    feat_miss_last.close()
    
    feat_miss=open('features_missing.txt','w')
    feat_miss.write("\nfeatures missing entirely on azucena :\n")
    for entier in feature_missing_all :
        feat_miss.write(Features[entier].annot)
        if Features[entier].type!='gene':
            feat_miss.write(';')
            feat_miss.write(Features[entier].note)
        feat_miss.write("\n")
    feat_miss.close()

    feat_miss_total=open('features_missing_total.txt','w')
    feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
    for total in feature_missing_total :
        feat_miss_total.write(Features[total].annot)
        if Features[total].type!='gene':
            feat_miss_total.write(';')
            feat_miss_total.write(Features[total].note)
        feat_miss_total.write("\n")
    feat_miss_total.close()
    
    feat_all=open('all_features.txt','w')
    for feat in list_feature:
        feat_all.write(Features[feat].annot)
        if Features[feat].type!='gene':
            feat_all.write(';')
            feat_all.write(Features[feat].note)
        feat_all.write("\n")
    feat_all.close()

    feat_ok=open('features_ok.txt','w')
    feat_ok.write("\nfeatures entirely present in azucena :\n")
    for ok in feature_ok :
        feat_ok.write(Features[ok].annot)
        if Features[ok].type!='gene':
            feat_ok.write(';')
            feat_ok.write(Features[ok].note)
        feat_ok.write("\n")
    feat_ok.close()

    # prints the stats for the features (hypothetical/putative rate, by category)
    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
    total=len(feature_missing_first)
    print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
    total=len(feature_missing_middle)
    print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
    total=len(feature_missing_last)
    print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
    total=len(feature_missing_all)
    print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
    total=len(feature_missing_total)
    print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
    total=len(feature_ok)
    print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
    total=len(list_feature)
    print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    # delete the files created for the stats
    command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
    subprocess.run(command,shell=True)


# outputs each fragment of th feature in a gff
def gff_detail(list_seg,seg_genome,feat,file_out_detail):
    # loop that goes through all the segments that have the current feature
    # keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the current part of the fragment, and resets the first segment present.
    # continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature.
    # etc.
    # at the end of the loop, prints the last part of the fragment.

    feat_start="empty" # stocks the first segment of the feature
    seg_start="empty" # stocks the first segment of the current part of the feature
    #seg_stop="empty" # stocks the last segment of the current part of the feature

    for i in range(0,len(list_seg)):
        if list_seg[i][1:] in seg_genome: # look for a segment present in the genome
            if feat_start=="empty":
                feat_start=list_seg[i]
    
            if seg_start=="empty": # if we dont have a start, take the current segment for the start of the part of the feature
                seg_start=list_seg[i]
                
            #else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
                
        else:
            if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
                    
                out_line=write_line(list_seg,i,feat,seg_start,feat_start,seg_genome)
                file_out_detail.write(out_line)
                seg_start="empty"
                #seg_stop="empty"
            
            #else: if the current segment is not in azucena but there is no start, keep looking for a start
                
                
    # print the last part of the feature in the end
    if list_seg[i][1:] in seg_genome:
        out_line=write_line(list_seg,i+1,feat,seg_start,feat_start,seg_genome)
        file_out_detail.write(out_line)


# outputs the feature once in a gff, from the first to the last segment present on the new genome (if the size is ok) :
def gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n):
    
    # count the number of features not transfered because they are too big or too small on the new genome
    bad_size=0
    # count the number of features absent on the new genome
    absent_features=0

    if (first_seg!='0') & (last_seg!='0'):

        start2=get_start_2(first_seg,seg_genome,feat)
        stop2=get_stop_2(last_seg,seg_genome,feat)
        size_on_new_genome=int(stop2)-int(start2)+1
        size_diff=abs(size_on_new_genome-Features[feat].size)

        # if the gene on the new genome is not 2 times bigger or smaller than the original, write its line in the gff
        if not ((size_on_new_genome>Features[feat].size*n) | (size_on_new_genome<Features[feat].size/n)) : 
            chr=seg_genome[first_seg][0]
            strand=list_seg[0][0:1]
            
            nb_variants=0
            for segment in list_seg:
                if segment[1:] not in seg_genome: # if a segment is absent, it is either a deletion of a substitution. insertions are not considered here.
                    nb_variants+=1

            annotation=Features[feat].annot+";Size_diff="+str(size_diff)+";Nb_variants="+str(nb_variants)

            line=chr+"  GrAnnot   "+Features[feat].type+" "+str(get_start_2(first_seg,seg_genome,feat))+" "+str(get_stop_2(last_seg,seg_genome,feat))+"   .   "+strand+"   .   "+annotation+"\n"
            file_out.write(line)
        else:
            bad_size+=1
    else :
        absent_features+=1
    
    return list([bad_size,absent_features])


# outputs the detail of the variations on the new genome for the feature
def print_variations(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq,list_var,list_var_rate):
    string_print=""
    cpt_print=0
    
    # list of size difference, to get the average size difference
    diff_list=list()
    list_var_temp=[0,0,0,0,0]

    # create type_n the index of the list of the lists in list_var. for the stats of the number of variation by type of feature.
    type=Features[feat].type
    match type:
        case "gene":
            type_n=0
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "mRNA":
            type_n=1
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "intron":
            type_n=2
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "exon":
            type_n=3
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "three_prime_UTR":
            type_n=4
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "five_prime_UTR":
            type_n=5
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "CDS":
            type_n=6
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size


    # get the list of the segments in a feature that are missing in the genome
    segments_missing=list()
    for segment in list_seg:
            if segment[1:] not in seg_genome:
                segments_missing.append(Segments[segment[1:]])

    # if there are missing segments, print them
    if (first_seg!='0') & (last_seg!='0'):

        var=0 # count variations, to see if there is any

        feature=Features[feat]
        feat_start=feature.start

        # get the lengths of the feature, on the original genome and on the new one
        start_new_genome=get_start_2(first_seg,seg_genome,feat)
        stop_new_genome=get_stop_2(last_seg,seg_genome,feat)
        size_new_genome=int(stop_new_genome)-int(start_new_genome)+1
        size_diff=str(size_new_genome-feature.size)

      
        diff_list.append(abs(int(size_diff))) # to have stats on the size of the segments missing
        #line=feat+" : \nlength on the original genome = "+str(feature.size)+"\nlength difference (length in the new genome-length in the original genome) = "+size_diff+"\n"
        #file_out2.write(line)

        
        # find the path in azucena. 
        first_strand=seg_genome[first_seg][3]
        first_seg_stranded=first_strand+first_seg
        last_strand=seg_genome[last_seg][3]
        last_seg_stranded=last_strand+last_seg
        id_first_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(first_seg_stranded))
        id_last_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(last_seg_stranded))
        list_segfeat_azu=paths["CM020642.1_Azucena_chromosome10"][id_first_seg:id_last_seg+1]
        list_segfeat_nb=Features[feat].segments_list

        # loop to go through both paths
        i=0
        j=0

        last=''
        last_taille=0
        last_seq=''
        last_start=''
       #pos=''
        last_in_azu=''

        # check if there is an inversion
        inversion=detect_inversion(list_segfeat_nb,list_segfeat_azu)
        if inversion:
            inverted="1"
        else:
            inverted="0"
        #if detect_inversion(list_segfeat_nb,list_segfeat_azu):
        #    line="inversion detected\n"
        #    file_out2.write(line)

        # remove the strands from the lists of segments :
        list_segfeat_nb=remove_strand(list_segfeat_nb)
        list_segfeat_azu=remove_strand(list_segfeat_azu)

        # detect and print variations ignoring the strands
        while (i<len(list_segfeat_nb)) & (j<len(list_segfeat_azu)):
            if list_segfeat_nb[i] != list_segfeat_azu[j]: # if there is a difference between the two paths
                if list_segfeat_azu[j] not in list_segfeat_nb: # if the segment in azu is absent in nb
                    if list_segfeat_nb[i] not in list_segfeat_azu: # if the segment in nb is absent is azu
                        # is both segments are absent in the other genome, its a substitution
                        last_in_azu=list_segfeat_azu[j]

                        # print if we had an insertion or deletion running
                        if last=='insertion':
                            pos_old=str(int(Segments[last_start].start)-int(feat_start)+1)

                            start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                            start_var=int(seg_genome[last_start][1])-1
                            pos_new=str(start_var-start_feat+1)

                            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                            string_print+=line
                            cpt_print+=1
                            #file_out2.write(line)
                            list_var[1][type_n]+=1
                            list_var_temp[1]+=1
                            var+=1

                        elif last=='deletion':
                            pos_old=int(Segments[last_start].start)-int(feat_start)+1

                            start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                            start_var=int(seg_genome[last_in_azu][2])+1 
                            pos_new=str(start_var-start_feat+1)

                            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                            string_print+=line
                            cpt_print+=1
                            #file_out2.write(line)
                            list_var[0][type_n]+=1
                            list_var_temp[0]+=1                            
                            var+=1

                        last='';last_taille=0;last_seq='';last_start=''

                        # print the substitution
                        # substitution of segment list_segfeat_nb[i][1:] by segment list_segfeat_azu[j][1:]
                        pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
                        start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat)
                        start_var=int(seg_genome[list_segfeat_azu[j-1]][2])+1
                        pos_new=str(start_var-start_feat+1)
                        # start du fragment (var) - start du gène +1

                        # start de la var (donc fin du segment d'avant +1) = seg_genome[id_seg][2]+1
                        # -> trouver le segment d'avant -> j-1.
                        

                        # start du gène sur azu -> get_start_2 premier_seg, seg_genome, feature_id
                        # feat_id = feat
                        # seg_genome = seg_gneome
                        # premier_seg sur azu -> list_segfeat_azu[0]
                        

                        # seg_genome donne les positions sur azu
                        # seg_genome[seg]=list([ch,start,stop,strand])
                        if len(seg_seq[list_segfeat_nb[i]]) == len(seg_seq[list_segfeat_azu[j]]): # if the substituion is between two segment of the same size, print it
                            size_subs=len(seg_seq[list_segfeat_nb[i]])
                            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tsubstitution\t"+seg_seq[list_segfeat_nb[i]]+"\t"+seg_seq[list_segfeat_azu[j]]+"\t"+str(size_subs)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                        else :
                            # if the segments of the substitution have a different size, print deletion then insertion at the same position.
                            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+seg_seq[list_segfeat_nb[i]]+"\t-\t"+str(len(seg_seq[list_segfeat_nb[i]]))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                            line+=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+seg_seq[list_segfeat_azu[j]]+"\t"+str(len(seg_seq[list_segfeat_azu[j]]))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                            var+=1
                        string_print+=line
                        cpt_print+=1
                        #file_out2.write(line)
                        list_var[2][type_n]+=1
                        list_var_temp[2]+=1
                        var+=1
                        # go to the next segments on the paths
                        i+=1;j+=1

                    else: # azu segment not in nb, but nb segment in azu : insertion
                        if last=='deletion':
                            if i==0:
                                pos_old=int(Segments[last_start].start)-int(feat_start)+1+feature.pos_start
                            else:
                                pos_old=int(Segments[last_start].start)-int(feat_start)+1

                            start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                            start_var=int(seg_genome[last_in_azu][2])+1 
                            pos_new=str(start_var-start_feat+1)

                            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                            string_print+=line
                            cpt_print+=1
                            #file_out2.write(line)
                            list_var[0][type_n]+=1
                            list_var_temp[0]+=1
                            var+=1;last='';last_taille=0;last_start='';last_seq=''

                        last_in_azu=list_segfeat_azu[j]

                        if last=='insertion':
                            #last_taille+=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
                            last_seq=last_seq+seg_seq[list_segfeat_azu[j]]
                        else:
                            last='insertion'
                            last_seq=seg_seq[list_segfeat_azu[j]]
                            #last_taille=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
                            last_start=list_segfeat_nb[i]
                        j+=1
                        
                elif list_segfeat_nb[i] not in list_segfeat_azu: # nb segment not in azu, but azu segment in nb : deletion
                    if last=='insertion':
                        pos_old=int(Segments[last_start].start)-int(feat_start)+1
                        
                        start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                        start_var=int(seg_genome[last_start][1])-1
                        pos_new=str(start_var-start_feat+1)
                        
                        line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                        string_print+=line
                        cpt_print+=1
                        #file_out2.write(line)
                        list_var[1][type_n]+=1
                        list_var_temp[1]+=1
                        var+=1;last='';last_start='';last_taille=0;last_seq=''

                    if last=='deletion':
                        #last_taille+=Segments[list_segfeat_nb[i]].size
                        last_seq=last_seq+seg_seq[list_segfeat_nb[i]]
                    else:
                        last='deletion'
                        last_start=list_segfeat_nb[i]
                        #last_taille=Segments[list_segfeat_nb[i]].size
                        if i==0: # if the deletion is at the start of the feature, the deletion doesnt start at the start at the first segment : 
                                    #use pos_start, position of the feature on its first segment
                            last_seq=seg_seq[list_segfeat_nb[i]][feature.pos_start-1:]
                        else: # else, the deletion will always start at the start of the first segment.
                            last_seq=seg_seq[list_segfeat_nb[i]]
                        #if i==0:
                        #    start=1
                    i+=1
                    
                else : # idk yet. if both segments are present in the other genome but not at the same position. probably substitution then
                    line="weird order change"
                    string_print+=line
                    cpt_print+=1
                    #file_out2.write(line)
                    var+=1;i+=1;j+=1

            else: # segment present in both. print the running indel if there is one
                if last=='insertion':
                    pos_old=int(Segments[last_start].start)-int(feat_start)+1

                    start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                    start_var=int(seg_genome[last_start][1])-1
                    pos_new=str(start_var-start_feat+1)
                    
                    line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                    string_print+=line
                    cpt_print+=1
                    #file_out2.write(line)
                    list_var[1][type_n]+=1
                    list_var_temp[1]+=1
                    var+=1

                elif last=='deletion':
                    if i==0:
                        pos_old=int(Segments[last_start].start)-int(feat_start)+1+feature.pos_start
                    else:
                        pos_old=int(Segments[last_start].start)-int(feat_start)+1

                    if last_in_azu=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet. 
                        pos_new="1"
                    else :
                        start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                        start_var=int(seg_genome[last_in_azu][2])+1 
                        pos_new=str(start_var-start_feat+1)
                    
                    line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                    string_print+=line
                    cpt_print+=1
                    #file_out2.write(line)
                    list_var[0][type_n]+=1
                    list_var_temp[0]+=1
                    var+=1

                last_in_azu=list_segfeat_azu[j]

                last='';last_taille=0;last_start='';last_seq='';i+=1;j+=1

                
        # finish printing the current indel
        if last=='insertion':
            pos_old=int(Segments[last_start].start)-int(feat_start)+1

            start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
            start_var=int(seg_genome[last_start][1])-1
            pos_new=str(start_var-start_feat+1)

            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(last_taille)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
            string_print+=line
            cpt_print+=1
            #file_out2.write(line)
            list_var[1][type_n]+=1
            list_var_temp[1]+=1
            var+=1

        elif last=='deletion':
            pos_old=int(Segments[last_start].start)-int(feat_start)+1

            start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
            start_var=int(seg_genome[last_in_azu][2])+1 
            pos_new=str(start_var-start_feat+1)

            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
            string_print+=line
            cpt_print+=1
            #file_out2.write(line)
            list_var[0][type_n]+=1
            list_var_temp[0]+=1
            var+=1


        # see if the end is missing for one of the two genomes
        if not((i>=len(list_segfeat_nb)-1) & (j>=len(list_segfeat_azu)-1)):
            if (i<len(list_segfeat_nb)-1):
                if not (j>=len(list_segfeat_azu)-1):
                    print('pb')
                # length of the deletion = position of the end of the feature - position of the start of the deletion
                #length=feature.stop-Segments[list_segfeat_nb[i]].start # pas bon, on prend le dernier segment en entier. len(del_sequence) plus sur. 
                pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1

                del_sequence=""
                for k in range(i,len(list_segfeat_nb)):
                    if k==len(list_segfeat_nb):
                        del_sequence+=seg_seq[list_segfeat_nb[k]][0:feature.pos_stop]
                    else:
                        del_sequence+=seg_seq[list_segfeat_nb[k]]
                length=len(del_sequence)

                pos_new=str(length) # the deletion is at the end of the feature on the new genome

                line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+del_sequence+"\t-\t"+str(length)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                string_print+=line
                cpt_print+=1
                #file_out2.write(line)
                list_var[0][type_n]+=1
                var+=1
            elif (j<len(list_segfeat_azu)-1):
                print("ins at the end ?")
            #   string_print+=line   
            #   #file_out2.write("insertion at the end\n")
            #   list_var[1][type_n]+=1
            #   var+=1

        if var==0:
            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(size_new_genome)+"\t"+size_diff+"\tno_var\t-\t-\t-\t-\t-\n"
            string_print+=line
            cpt_print+=1
            #file_out2.write(line)
    #else:
        #line=feat+" : feature entirely absent.\n"
        #file_out2.write(line)


    for i in [0,1,2]:
        if list_var_temp[4]!=0:
            list_var_rate[i][type_n].append(list_var_temp[i]/list_var_temp[4])
            # no need to return this list, it modifies the original list directly

    if string_print!="":
        file_out2.write(string_print)

    return diff_list


# writes the gff of azucena using the gff of the graph
def genome_gff(pos_seg, gff, out, gfa):
    
    print("generation of the genome's gff ")
    
    # create a dictionnary with the positions of the segments on the genome to transfer on
    seg_genome={}
    bed=open(pos_seg,'r')
    lines=bed.readlines()
    for line in lines:
        line=line.split()
        if line[3][0:1]=='>':
            strand='+'
        elif line[3][0:1]=='<':
            strand='-'
        else:
            strand=''
        seg=line[3][1:]
        ch=line[0]
        start=line[1]
        stop=line[2]
        seg_genome[seg]=list([ch,start,stop,strand])
    bed.close()

    
    # create a dictionnary with the sequences of the segments of the graph and a dictionary with the paths from the gfa. each genome name gives the list of its segments in order
    file_gfa=open(gfa,'r')
    lines_gfa=file_gfa.readlines()
    file_gfa.close()
    seg_seq={}
    paths={}
    for line in lines_gfa:
        line=line.split()
        if (line[0]=="S"): # get the sequence of the segment
            seg_id='s'+line[1]
            seg_seq[seg_id]=line[2]

        if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
            
            path=line[6].replace(">",";>")
            path=path.replace("<",";<").split(';')
            list_path=list()
            for segment in path:
                if segment[0:1]=='>':
                    list_path.append('+s'+segment[1:])
                elif segment[0:1]=='<':
                    list_path.append('-s'+segment[1:])
                else:
                    list_path.append('s'+segment[1:])
 
            paths[line[3]]=list_path


    file_out_detail = open("azucena_chr10_detail.gff", 'w')
    file_out=open(out,'w')
    file_out2 = open("variations.txt", 'w')

    diff_list=list()
    bad_size=0
    absent_features=0

    # for stats by type of variation
    del_gene=0;del_mrna=0;del_intron=0;del_exon=0;del_3utr=0;del_5utr=0;del_cds=0
    deletions=[del_gene,del_mrna,del_intron,del_exon,del_3utr,del_5utr,del_cds]

    ins_gene=0;ins_mrna=0;ins_intron=0;ins_exon=0;ins_3utr=0;ins_5utr=0;ins_cds=0
    insertions=[ins_gene,ins_mrna,ins_intron,ins_exon,ins_3utr,ins_5utr,ins_cds]

    sub_gene=0;sub_mrna=0;sub_intron=0;sub_exon=0;sub_3utr=0;sub_5utr=0;sub_cds=0
    substitutions=[sub_gene,sub_mrna,sub_intron,sub_exon,sub_3utr,sub_5utr,sub_cds]

    nb_type=[0,0,0,0,0,0,0]
    length_type=[0,0,0,0,0,0,0]

    list_var=[deletions,insertions,substitutions,nb_type,length_type]

    deletions_rate=[list(),list(),list(),list(),list(),list(),list()]
    insertions_rate=[list(),list(),list(),list(),list(),list(),list()]
    substitutions_rate=[list(),list(),list(),list(),list(),list(),list()]
    list_var_rate=[deletions_rate,insertions_rate,substitutions_rate]

    stats=False
    plot=False
  
    # get the list of all the features to transfer from the gff
    gff=open(gff,'r')
    list_feature=list()
    lines=gff.readlines()
    for line in lines:
        id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
        if id not in list_feature:
            list_feature.append(id)
    gff.close() 
    
    # create objects for stats on how many segments are absent in azucena, their average length, etc
    if stats==True:
        seg_first=list()
        seg_middle=list()
        seg_last=list()
        seg_ok=list()
        seg_entier=list()
        seg_total=list()

        feature_missing_first=list()
        feature_missing_middle=list()
        feature_missing_last=list()
        feature_missing_all=list()
        feature_missing_total=list()
        feature_total=list()
        feature_ok=list()


    for feat in list_feature:

        # for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome
        list_seg=Features[feat].segments_list

        first_seg="0"
        for segment in list_seg:
            if segment[1:] in seg_genome:
                first_seg=segment[1:]
                break
        last_seg="0"
        for segment in reversed(list_seg):
            if segment[1:] in seg_genome:
                last_seg=segment[1:]
                break


        # stats on how many segments are absent in azucena, their average length, etc
        

        # for the segments

        # segment missing that is at the start of a feature - seg_first
        # segment missing that is at the end of a feature - seg_last
        # segment missing that is in the middle of a feature - seg_middle
        # segment missing that contains the entire feature - seg_entier
        # total number of genes missing - seg_total_abs
        # segments not missing - seg_ok
        # segments missing of not - seg_total

        if (stats==True):

            for segment in list_seg: # counts the segments in each category
                seg_len=Segments[segment[1:]].size
                if segment[1:] not in seg_genome: # the segment is missing
                    seg_total.append(seg_len)
                    if segment==list_seg[0]: # the segment is the first of the feature
                        if segment==list_seg[-1]: # the segment is the last of the feature
                            seg_entier.append(seg_len) # the segment is the first and the last, so it contains the entire feature
                        else: 
                            seg_first.append(seg_len) 
                    elif segment==list_seg[-1]: # the segment is the last of the feature
                        seg_last.append(seg_len)
                    else:
                        seg_middle.append(seg_len)
                else: # the segment is present on the genome
                    seg_ok.append(seg_len)

        # for the features : 
        # the fist segment of the feature is missing - feature_missing_first
        # the last segment of the feature is missing - feature_missing_last
        # at least one middle segment of the feature is missing - feature_missing_middle
        # the entire feature is missing ->feature_missing_all
        # at least one segment is missing first, last, or middle) - feature_missing_total
        # no segment is missing, the feature is complete - feature_ok  
        # total number of features, with missing segments or not - feature_total

        if (stats==True):

            feature_total.append(feat)

            if (first_seg=='0') & (last_seg=='0') : # no segment of the feature is in the genome, the feature is missing entirely
                feature_missing_all.append(feat)
            elif first_seg != list_seg[0][1:]: # the first segment is missing 
                feature_missing_first.append(feat)
            elif last_seg!=list_seg[-1][1:]: # the last segment is missing
                feature_missing_last.append(feat)

            # go through all the segments, check if some are missing in the middle of the feature
            elif (len(list_seg)!=1) & (feat not in feature_missing_all): # to access the second to last element
                for segment in list_seg[1-(len(list_seg)-2)]:
                    if segment[1:] not in seg_genome:
                        feature_missing_middle.append(feat)
                        break


            # go through the segments, to see if one is missing anywhere on the feature
            for segment in list_seg:
               if segment[1:] not in seg_genome:
                   if feat not in feature_missing_total:
                        feature_missing_total.append(feat)
                        break

            # if the feature doesnt have a missing segment, it is complete.         ADD THE PATH CHECK !!
            if feat not in feature_missing_total:
                feature_ok.append(feat)


        # outputs each fragment of the feature in a gff  
        gff_detail(list_seg,seg_genome,feat,file_out_detail) # insertions !
        
        # outputs the feature once in a gff, from the first to the last segment present on the new genome
        n=2 # maximum difference size (n times bigger of smaller)
        result=gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n)
        bad_size+=result[0]
        absent_features+=result[1]
        
        # outputs the detail of variations for the feature :
        variations_output=print_variations(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq,list_var,list_var_rate)
        diff_list+=variations_output

    
    file_out_detail.close()
    file_out.close()
    file_out2.close()

    # changer la forme de l'output des variations
    #print(list_var)
    #print(list_var_rate)
    #print(sum(list_var_rate[1][0])/len(list_var_rate[1][0]))


    if plot==True:
        import seaborn as sns
        import matplotlib.pyplot as plt
        # set a grey background (use sns.set_theme() if seaborn version 0.11.0 or above) 
        sns.set(style="darkgrid")

        import pandas as pd

        df_deletions = pd.DataFrame(list_var_rate[0])#, columns=['genes','mrna','introns','exons','3utr','5utr','cds'])
        df_deletions=df_deletions.T
        df_deletions.columns = ['genes', 'mrna', 'introns', 'exons', '3utr', '5utr', 'cds']

        sns.violinplot(data=df_deletions)
        plt.ylim(-0.2, 0.2)
        plt.show()

        df_insertions = pd.DataFrame(list_var_rate[1])#, columns=['genes','mrna','introns','exons','3utr','5utr','cds'])
        df_insertions=df_insertions.T
        df_insertions.columns = ['genes', 'mrna', 'introns', 'exons', '3utr', '5utr', 'cds']

        sns.violinplot(data=df_insertions)
        plt.ylim(-0.2, 0.2)
        plt.show()


        df_substitutions = pd.DataFrame(list_var_rate[2])#, columns=['genes','mrna','introns','exons','3utr','5utr','cds'])
        df_substitutions=df_substitutions.T
        df_substitutions.columns = ['genes', 'mrna', 'introns', 'exons', '3utr', '5utr', 'cds']

        sns.violinplot(data=df_substitutions)
        plt.ylim(-0.2, 0.2)
        plt.show()


    # print stats
    from statistics import median, mean
    if stats==True:
        print(len(Features)-(bad_size+absent_features),"out of",len(Features),"features are transfered.")
        print(bad_size,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
        print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
        if len(diff_list)!=0:
            print("average length difference of the transfered genes : ",sum(diff_list)/len(diff_list))

        stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok)

        # prints the stats for the segments
        #print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
        #print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
        #print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
        #print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
        #print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
        #print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))