Functions.py

import ClassSegFeat as Class
import subprocess

# functions to create the segments and the features

# create a segment with its first feature
def init_seg(line,feature):
    line=line.split()
    seg_id=line[3][1:]
    chr=line[0]
    start=line[1]
    stop=line[2]
    
    # add the current feature to the list of features that are on the segment
    feature_strand=line[10]
    feature_stranded=feature_strand+feature
    feat=list()
    feat.append(feature_stranded)
    
    # create the segment
    Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)


# create a feature with the segment its on
def init_feature(line):
    line=line.split()
    feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
    type=line[6]
    annot=line[12]
    chr=line[4]
    start=line[7]
    stop=line[8]
    childs=list()
    
    # add the current segment to the list of segments that have the feature
    seg=line[3][1:]
    strand=line[10]
    seg_stranded=strand+seg
    segments_list=list()
    segments_list.append(seg_stranded)

    # if the current feature has a parent, add the current feature in the childs list of its parent
    if annot.split(";")[1].split("=")[0]=="Parent":
    # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    
    elif annot.split(";")[2].split("=")[0]=="Parent":
    # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    else: parent=""

    # create the feature
    Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)


# add a feature to an existing segment
def add_feature(seg,new_feature,strand):
    new_feature_stranded=strand+new_feature
    if new_feature_stranded not in Segments[seg].features:
        Segments[seg].features.append(new_feature_stranded)
    

# add a child feature to an existing feature
def add_child(feat,new_child):
    if feat in Features.keys(): # if the parent feature exists
        if new_child not in Features[feat].childs:
            Features[feat].childs.append(new_child)
    
    
# add a segment to an existing feature
def add_seg(feat,new_seg,strand):
    seg_stranded=strand+new_seg
    if seg_stranded not in Features[feat].segments_list:
        Features[feat].segments_list.append(seg_stranded)


# create a note for the child features that do not have annotation. 
def set_note(id):
    # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
    feat=Features[id]
    if feat.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
        feat.note=feat.annot.split(';')[-1]
    else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
        # we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
        curent=feat.parent
        annot_found=False
        while annot_found==False:
            if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
                note=Features[curent].annot.split(';')[-1]
                feat.note=note
                annot_found=True
            else: # if we didn't find the gene, we go back to the current feature's parent until we find it
                curent=Features[Features[curent].parent].id


# create all the Segment and Feature objects in the dictionnaries Segments and Features
def create_seg_feat(intersect_path):    

    global Features
    global Segments
    Segments={}
    Features={}
    
    # open the file with the intersect between the segments and the gff
    file = open(intersect_path, 'r')
    lines=file.readlines()
    
    for line in lines:
        # get the ids for the dictionnaries' keys
        feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
        segment_id=line.split()[3][1:]
    
        if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
            init_feature(line)
        else: # if it exists, add the current segment to the list of segments that have the existing feature
            strand=line.split()[10]
            add_seg(feature_id,segment_id,strand)
    
        if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
            init_seg(line, feature_id)
        
        else: # if it exists, add the current feature to the list of features on the existing segment
            strand=line.split()[10]
            add_feature(segment_id,feature_id,strand)

    # for all the features, add the note (information on the function of the feature), and the positions on the first and last seg.
    for feat_id in Features:
        feat=Features[feat_id]
        set_note(feat.id)
        feat.pos_start=get_start(feat.segments_list[0][1:],feat.id)
        feat.pos_stop=get_stop(feat.segments_list[-1][1:],feat.id)

    file.close()


# functions to generate the graph's gff from the segments and features created with create_seg_feat

# get the feature's start position on the segment
def get_start(seg,feat):
    s=Segments[seg]
    f=Features[feat]
    if s.start>f.start:
        result=1
    else:
        result=f.start-s.start
    return result


# get the feature's stop position on the segment
def get_stop(seg,feat):
    s=Segments[seg]
    f=Features[feat]    
    if s.stop<f.stop:
        result=s.size
    else:
        result=f.stop-s.start
    return result


# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
    print("generation of the graph's gff")
    file_out = open(file_out, 'w')

    for segment in Segments:
        # get the list of the features on the segment
        features_seg=Segments[segment].features
        
        # go through all these features, and print the gff line for each
        for feature_stranded in features_seg:
            strand=feature_stranded[0:1]
            feature=feature_stranded[1:]
            segment_stranded=strand+segment
            type=Features[feature].type
            start=get_start(segment,feature)
            stop=get_stop(segment,feature)
                
            # get the rank and the total number of ocurrences for the feature
            rank=str(Features[feature].segments_list.index(segment_stranded)+1)
            total=str(len(Features[feature].segments_list))

            # create the annotation with the rank information
            annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total

            # write the gff line in the output file
            line=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
            file_out.write(line)

    file_out.close()


# functions to generate a genome's gff from the graph's gff

# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_start_2(seg_start, seg_genome, feat_id):
    ns=seg_start[1:]
    s_start=seg_genome[seg_start][1]
    f_start=get_start(seg_start,feat_id)
    
    start=int(s_start)+int(f_start)
    return start


# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_stop_2(seg_stop, seg_genome, feat_id):
    s_start=seg_genome[seg_stop][1]
    f_stop=get_stop(seg_stop,feat_id)

    stop=int(s_start)+int(f_stop)
    return stop


# get the position of a part of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,feat_start,feature,seg_genome):
    
    # start position of the entire feature on the reference
    start_gene_start=Segments[feat_start[1:]].start+get_start(feat_start[1:],feature.id)-1
    
    # start position of the piece of the feature on the reference
    start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
    
    # start and stop position of the feature on azucena, to get the length of the piece of the feature
    start_azu=get_start_2(seg_start,seg_genome,feature.id) 
    stop_azu=get_stop_2(seg_stop,seg_genome,feature.id)
    
    # start position of the piece of the feature on the complete feature
    start_gene=int(start_first_seg)-int(start_gene_start)+1
    # stop position : start+length
    stop_gene=start_gene+(stop_azu-start_azu)
    
    position=";position="+str(start_gene)+"-"+str(stop_gene)
    #position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
    return position


# get the proportion of a part of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_genome,feature):
    start_azu=get_start_2(seg_start,seg_genome,feature.id) # start position of the feature on azucena
    stop_azu=get_stop_2(seg_stop,seg_genome,feature.id) # stop position of the feature on azucena
    
    proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size)
    #proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size)
    return proportion


# returns the gff line to write in the output file
def write_line(list_seg,i,feat,seg_start,feat_start,seg_genome):
    seg_stop=list_seg[i-1]
    feature=Features[feat]
    chr=seg_genome[seg_start[1:]][0]
    strand=list_seg[i-1][0:1]
    
    # start and stop position of the feature on azucena
    start_azu=get_start_2(seg_start[1:],seg_genome,feature.id) 
    stop_azu=get_stop_2(seg_stop[1:],seg_genome,feature.id)
    
    proportion=get_proportion(seg_start[1:],seg_stop[1:],seg_genome,feature)
    position=get_position(seg_start[1:],seg_stop[1:],feat_start,feature,seg_genome)
    
    annotation=feature.annot+proportion+position
    
    out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
    return out_line


def stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok):

    # creates files with the features by category (missing first, middle, end segment, etc)
    feat_miss_first=open('features_missing_first.txt','w')
    feat_miss_first.write("features missing the first segment on azucena :\n")
    for first in feature_missing_first :
        feat_miss_first.write(Features[first].annot)
        if Features[first].type!='gene':
            feat_miss_first.write(';')
            feat_miss_first.write(Features[first].note)
        feat_miss_first.write("\n")
    feat_miss_first.close()
        
    feat_miss_middle=open('features_missing_middle.txt','w')
    feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
    for middle in feature_missing_middle :
        feat_miss_middle.write(Features[middle].annot)
        if Features[middle].type!='gene':
            feat_miss_middle.write(';')
            feat_miss_middle.write(Features[middle].note)
        feat_miss_middle.write("\n")
    feat_miss_middle.close()
       
    feat_miss_last=open('features_missing_last.txt','w')
    feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
    for last in feature_missing_last :
        feat_miss_last.write(Features[last].annot)
        if Features[last].type!='gene':
            feat_miss_last.write(';')
            feat_miss_last.write(Features[last].note)
        feat_miss_last.write("\n")
    feat_miss_last.close()
    
    feat_miss=open('features_missing.txt','w')
    feat_miss.write("\nfeatures missing entirely on azucena :\n")
    for entier in feature_missing_all :
        feat_miss.write(Features[entier].annot)
        if Features[entier].type!='gene':
            feat_miss.write(';')
            feat_miss.write(Features[entier].note)
        feat_miss.write("\n")
    feat_miss.close()

    feat_miss_total=open('features_missing_total.txt','w')
    feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
    for total in feature_missing_total :
        feat_miss_total.write(Features[total].annot)
        if Features[total].type!='gene':
            feat_miss_total.write(';')
            feat_miss_total.write(Features[total].note)
        feat_miss_total.write("\n")
    feat_miss_total.close()
    
    feat_all=open('all_features.txt','w')
    for feat in list_feature:
        feat_all.write(Features[feat].annot)
        if Features[feat].type!='gene':
            feat_all.write(';')
            feat_all.write(Features[feat].note)
        feat_all.write("\n")
    feat_all.close()

    feat_ok=open('features_ok.txt','w')
    feat_ok.write("\nfeatures entirely present in azucena :\n")
    for ok in feature_ok :
        feat_ok.write(Features[ok].annot)
        if Features[ok].type!='gene':
            feat_ok.write(';')
            feat_ok.write(Features[ok].note)
        feat_ok.write("\n")
    feat_ok.close()

    # prints the stats for the features (hypothetical/putative rate, by category)
    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
    total=len(feature_missing_first)
    print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
    total=len(feature_missing_middle)
    print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
    total=len(feature_missing_last)
    print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
    total=len(feature_missing_all)
    print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
    total=len(feature_missing_total)
    print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
    total=len(feature_ok)
    print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
    total=len(list_feature)
    print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
    subprocess.run(command,shell=True)


def gff_detail(list_seg,seg_genome,feat,file_out_detail):
    # outputs each fragment of each feature, with its position on the new genome and its annotation : 
    # loop that goes through all the segments that have the current feature
    # keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the part of the fragment, and resets the first segment present.
    # continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature.
    # etc.
    # at the end of the loop, prints the last part of the fragment.
    size_list=len(list_seg)

    feat_start="empty" # stocks the first segment of the feature
    seg_start="empty" # stocks the first segment of the current part of the feature
    #seg_stop="empty" # stocks the last segment of the current part of the feature

    for i in range(0,size_list):
        if list_seg[i][1:] in seg_genome: # look for a segment present in the genome
            if feat_start=="empty":
                feat_start=list_seg[i]
    
            if seg_start=="empty": # if we dont have a start, take the current segment for the start of the part of the feature
                seg_start=list_seg[i]
                
            #else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
                
        else:
            if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
                    
                out_line=write_line(list_seg,i,feat,seg_start,feat_start,seg_genome)
                file_out_detail.write(out_line)
                seg_start="empty"
                #seg_stop="empty"
            
            #else: if the current segment is not in azucena but there is no start, keep looking for a start
                
                
    # print the last piece of the feature in the end
    if list_seg[i][1:] in seg_genome:
        out_line=write_line(list_seg,i+1,feat,seg_start,feat_start,seg_genome)
        file_out_detail.write(out_line)


def gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n):
    # outputs each feature once, from the first to the last segment present on the new genome and its annotation (if the size is ok) :
    bad_size=0
    absent_features=0
    if (first_seg!='0') & (last_seg!='0'):

        start2=get_start_2(first_seg,seg_genome,feat)
        stop2=get_stop_2(last_seg,seg_genome,feat)
        size_on_genome=int(stop2)-int(start2)+1
        size_diff=abs(size_on_genome-Features[feat].size)

        if not ((size_on_genome>Features[feat].size*n) | (size_on_genome<Features[feat].size/n)) : # si le nouveau gene est pas 2x plus grand ou plus petit que l'original
            chr=seg_genome[first_seg][0]
            strand=list_seg[0][0:1]

            start2=get_start_2(first_seg,seg_genome,feat)
            stop2=get_stop_2(last_seg,seg_genome,feat)
            size_on_genome=int(stop2)-int(start2)+1
            size_diff=abs(size_on_genome-Features[feat].size)

            nb_variants=0
            for segment in list_seg:
                if segment[1:] not in seg_genome:
                    nb_variants+=1

            annotation=Features[feat].annot+";Size_diff="+str(size_diff)+";Nb_variants="+str(nb_variants)

            line=chr+"  graph_gff   "+Features[feat].type+" "+str(get_start_2(first_seg,seg_genome,feat))+" "+str(get_stop_2(last_seg,seg_genome,feat))+"   .   "+strand+"   .   "+annotation+"\n"
            file_out.write(line)
        else:
            bad_size+=1
    else :
        absent_features+=1
    
    return list([bad_size,absent_features])


# writes the gff of azucena using the gff of the graph
def genome_gff(pos_seg, gff, out, gfa):
    
    print("generation of the genome's gff ")
    
    # create a dictionnary with the positions of the segments on the genome to transfer on
    seg_genome={}
    bed=open(pos_seg,'r')
    lines=bed.readlines()
    for line in lines:
        line=line.split()
        if line[3][0:1]=='>':
            strand='+'
        elif line[3][0:1]=='<':
            strand='-'
        else:
            strand=''
        seg=line[3][1:]
        ch=line[0]
        start=line[1]
        stop=line[2]
        seg_genome[seg]=list([ch,start,stop,strand])
    bed.close()

    
    # create a dictionnary with the sequences of the segments of the graph and a dictionary with the paths from the gfa. each genome name gives the list of its segments in order
    file_gfa=open(gfa,'r')
    lines_gfa=file_gfa.readlines()
    seg_seq={}
    paths={}
    for line in lines_gfa:
        line=line.split()
        if (line[0]=="S"):
            seg_id='s'+line[1]
            seg_seq[seg_id]=line[2]

        if (line[0]=="W") & (line[1]!="_MINIGRAPH_"):
            
            path=line[6].replace(">",";>")
            path=path.replace("<",";<").split(';')
            list_path=list()
            for segment in path:
                if segment[0:1]=='>':
                    list_path.append('+s'+segment[1:])
                elif segment[0:1]=='<':
                    list_path.append('-s'+segment[1:])
                else:
                    list_path.append('s'+segment[1:])

                #couple=['s'+segment[1:],segment[0:1]]
                ##list_path.append(couple)
                #list_path.append('s'+segment[1:])
 
            paths[line[3]]=list_path


    file_out_detail = open("azucena_chr10_detail.gff", 'w')
    file_out=open(out,'w')
    file_out2 = open("segments_manquants.txt", 'w')

    diff_list=list()
    bad_size=0
    absent_features=0

    stats=True
  
    # get the list of all the features to transfer from the gff
    gff=open(gff,'r')
    list_feature=list()
    lines=gff.readlines()
    for line in lines:
        id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
        if id not in list_feature:
            list_feature.append(id)
    gff.close() 
    
    # create objects for stats on how many segments are absent in azucena, their average length, etc
    if stats==True:
        seg_first=list()
        seg_middle=list()
        seg_last=list()
        seg_total_abs=list()
        seg_ok=list()
        seg_entier=list()
        seg_total=list()

        feature_missing_first=list()
        feature_missing_middle=list()
        feature_missing_last=list()
        feature_missing_all=list()
        feature_missing_total=list()
        feature_total=list()
        feature_ok=list()


    # for each feature, get list of the segments where it is and the first and last segment of the feature on the genome

    for feat in list_feature:
        list_seg=Features[feat].segments_list

        first_seg="0"
        for segment in list_seg:
            if segment[1:] in seg_genome:
                first_seg=segment[1:]
                break
        last_seg="0"
        for segment in reversed(list_seg):
            if segment[1:] in seg_genome:
                last_seg=segment[1:]
                break


        # stats on how many segments are absent in azucena, their average length, etc
        

        # for the segments

        # segment missing that is at the start of a feature - seg_first
        # segment missing that is at the end of a feature - seg_last
        # segment missing that is in the middle of a feature - seg_middle
        # segment missing that contains the entire feature - seg_entier
        # total number of genes missing - seg_total_abs
        # segments not missing - seg_ok
        # segments missing of not - seg_total

        if (stats==True):

            for segment in list_seg: # counts the segments in each category
                seg_len=Segments[segment[1:]].size
                if segment[1:] not in seg_genome: # the segment is missing
                    seg_total.append(seg_len)
                    if segment==list_seg[0]: # the segment is the first of the feature
                        if segment==list_seg[-1]: # the segment is the last of the feature
                            seg_entier.append(seg_len) # the segment is the first and the last, so it contains the entire feature
                        else: 
                            seg_first.append(seg_len) 
                    elif segment==list_seg[-1]: # the segment is the last of the feature
                        seg_last.append(seg_len)
                    else:
                        seg_middle.append(seg_len)
                else: # the segment is present on the genome
                    seg_ok.append(seg_len)

        # for the features : 
        # the fist segment of the feature is missing - feature_missing_first
        # the last segment of the feature is missing - feature_missing_last
        # at least one middle segment of the feature is missing - feature_missing_middle
        # the entire feature is missing ->feature_missing_all
        # at least one segment is missing first, last, or middle) - feature_missing_total
        # no segment is missing, the feature is complete - feature_ok  
        # total number of features, with missing segments or not - feature_total

        if (stats==True):
            feature_total.append(feat)

            if (first_seg=='0') & (last_seg=='0') : # no segment of the feature is in the genome, the feature is missing entirely
                feature_missing_all.append(feat)
            elif first_seg != list_seg[0][1:]: # the first segment is missing 
                feature_missing_first.append(feat)
            elif last_seg!=list_seg[-1][1:]: # the last segment is missing
                feature_missing_last.append(feat)

            # go through all the segments, check if some are missing in the middle of the feature
            elif (len(list_seg)!=1) & (feat not in feature_missing_all): # to access the second to last element
                for segment in list_seg[1-(len(list_seg)-2)]:
                    if segment[1:] not in seg_genome:
                        feature_missing_middle.append(feat)
                        break


            # go through the segments, to see if one is missing anywhere on the feature
            for segment in list_seg:
               if segment[1:] not in seg_genome:
                   if feat not in feature_missing_total:
                        feature_missing_total.append(feat)
                        break

            # if the feature doesnt have a missing segment, it is complete.         ADD THE PATH CHECK !!
            if feat not in feature_missing_total:
                feature_ok.append(feat)


        # outputs each fragment of each feature, with its position on the new genome and its annotation : 
        gff_detail(list_seg,seg_genome,feat,file_out_detail)
        
        # outputs each feature once, from the first to the last segment present on the new genome and its annotation (if the size is ok) :
        n=2
        result=gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n)
        bad_size+=result[0]
        absent_features+=result[1]
        
        # outputs the detail of the missing fragments for each feature :
        dif_list=diff_list+gff_missing(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq)
        diff_list=dif_list
        #print(feat,"ok")
        #gff_missing(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq)

    
    file_out_detail.close()
    file_out.close()
    file_out2.close()
    
    # print stats
    from statistics import median, mean
    if stats==True:
        print(len(Features)-(bad_size+absent_features),"out of",len(Features),"features are transfered.")
        print(bad_size,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
        print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
        print("average length difference of the transfered genes : ",sum(diff_list)/len(diff_list))

        stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok)

        # prints the stats for the segments
        #print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
        #print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
        #print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
        #print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
        #print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
        #print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
    

def gff_missing(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq):
    # outputs the detail of the missing fragments for each feature :
    diff_list=list()
    # get the list of the segments in a feature that are missing in the genome
    segments_missing=list()
    for segment in list_seg:
            if segment[1:] not in seg_genome:
                segments_missing.append(Segments[segment[1:]])

    # if there are missing segments, print them
    if (first_seg!='0') & (last_seg!='0'):
        var=0 # count variations, to see if there is any

        feature=Features[feat]
        feat_start=feature.start

        # get the lengths of the feature, on the original genome and on the new one
        start2=get_start_2(first_seg,seg_genome,feat)
        stop2=get_stop_2(last_seg,seg_genome,feat)
        size_on_genome=int(stop2)-int(start2)+1
        size_diff=size_on_genome-feature.size
        diff_list.append(abs(int(size_diff))) # to have stats on the size of the segments missing
        line=feat+" : \nlength on the original genome = "+str(feature.size)+"\nlength difference (length in the new genome-length in the original genome) = "+str(size_diff)+"\n"
        file_out2.write(line)
        
        # find the path in azucena. 
        first_strand=seg_genome[first_seg][3]
        first_seg_stranded=first_strand+first_seg
        last_strand=seg_genome[last_seg][3]
        last_seg_stranded=last_strand+last_seg
        id_first_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(first_seg_stranded))
        id_last_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(last_seg_stranded))
        list_segfeat_azu=paths["CM020642.1_Azucena_chromosome10"][id_first_seg:id_last_seg+1]
        list_segfeat_nb=Features[feat].segments_list

        # get list segments present in azucena.
        # for segment in list_seg:

        # boucles parcourir les deux listes.
        i=0
        j=0
        last=''
        last_taille=0
        last_seq=''
        last_start=''
        start=0
        pos=''

        # check if the gene is inverted :
        # if all the segments in common (no strand) have the oposite strand, change the strand (or ignore it ?).
        # the list from the paths are stranded. 
        # first get the list of the segments in common without the strand
        # then compare the strands
        # if all the strands are different, print 'there is an inversion'.
        # else print nothing. then ignore the strand.

        # check if there is an inversion
        if detect_inversion(list_segfeat_nb,list_segfeat_azu):
            line="inversion detected\n"
            file_out2.write(line)

        # remove the strands from the lists of segments :
        list_segfeat_nb=remove_strand(list_segfeat_nb)
        list_segfeat_azu=remove_strand(list_segfeat_azu)

        # detect and print variations ignoring the strands
        while (i<len(list_segfeat_nb)) & (j<len(list_segfeat_azu)):
        #for i in range(0,len(list_segfeat_nb)):
            if list_segfeat_nb[i] != list_segfeat_azu[j]: # if there is a difference between the two paths
                if list_segfeat_azu[j] not in list_segfeat_nb: # if the segment in azu is absent in nb
                    if list_segfeat_nb[i] not in list_segfeat_azu: # if the segment in nb is absent is azu
                        # is both segments are absent in the other genome, its a substitution

                        # print if we had an insertion or deletion running
                        if last=='insertion':
                            pos=int(Segments[last_start].start)-int(feat_start)+1
                            line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
                            file_out2.write(line)
                            var+=1
                        elif last=='deletion':
                            if start==1:
                                pos=int(Segments[last_start].start)-int(feat_start)+1
                                line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
                                start=0
                            else:
                                pos=int(Segments[last_start].start)-int(feat_start)+1
                                line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
                            file_out2.write(line)
                            var+=1
                        last=''
                        last_taille=0
                        last_seq=''

                        # print the substitution
                        # substitution of segment list_segfeat_nb[i][1:] by segment list_segfeat_azu[j][1:]
                        if i==0:
                            pos=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
                            line='substitution at the start of the feature of '+list_segfeat_nb[i]+' '+seg_seq[list_segfeat_nb[i]]+" by "+list_segfeat_azu[j]+' '+seg_seq[list_segfeat_azu[j]]+" at the position "+str(pos)+'\n'
                        else:
                            pos=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
                            line='substitution of '+list_segfeat_nb[i]+' '+seg_seq[list_segfeat_nb[i]]+" by "+list_segfeat_azu[j]+' '+seg_seq[list_segfeat_azu[j]]+" at the position "+str(pos)+'\n'
                        file_out2.write(line)
                        var+=1
                        # go to the next segments on the paths
                        i+=1
                        j+=1

                    else: # azu segment not in nb, but nb segment in azu : insertion
                        if last=='deletion':
                            if start==1:
                                pos=int(Segments[last_start].start)-int(feat_start)+1
                                line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
                                start=0
                            else:
                                pos=int(Segments[last_start].start)-int(feat_start)+1
                                line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
                            file_out2.write(line)
                            var+=1
                            last=''
                            last_taille=0
                            last_start=''
                            last_seq=''
                        if last=='insertion':
                            last_taille+=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
                            last_seq=last_seq+seg_seq[list_segfeat_azu[j]]
                        else:
                            last='insertion'
                            last_seq=seg_seq[list_segfeat_azu[j]]
                            last_taille=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
                            last_start=list_segfeat_nb[i]
                        j+=1
                        
                elif list_segfeat_nb[i] not in list_segfeat_azu: # nb segment not in azu, but azu segment in nb : deletion
                    if last=='insertion':
                        pos=int(Segments[last_start].start)-int(feat_start)+1
                        line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
                        file_out2.write(line)
                        var+=1
                        last=''
                        last_start=''
                        last_taille=0
                        last_seq=''
                    if last=='deletion':
                        last_taille+=Segments[list_segfeat_nb[i]].size
                        last_seq=last_seq+seg_seq[list_segfeat_nb[i]]
                    else:
                        last='deletion'
                        last_start=list_segfeat_nb[i]
                        last_taille=Segments[list_segfeat_nb[i]].size
                        last_seq=seg_seq[list_segfeat_nb[i]]
                        if i==0:
                            start=1
                    i+=1
                    
                else : # idk yet.
                    line="weird order change"
                    file_out2.write(line)
                    var+=1
                    i+=1
                    j+=1

            else: # segment present in both. print the running indel if there is one
                #print("segment",list_segfeat_nb[i][1:],'ok')
                if last=='insertion':
                    pos=int(Segments[last_start].start)-int(feat_start)+1
                    line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
                    file_out2.write(line)
                    var+=1
                elif last=='deletion':
                    if start==1:
                        pos=int(Segments[last_start].start)-int(feat_start)+1
                        line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
                        start=0
                    else:
                        pos=int(Segments[last_start].start)-int(feat_start)+1
                        line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
                    file_out2.write(line)
                    var+=1
                last=''
                last_taille=0
                last_start=''
                last_seq=''
                i+=1
                j+=1
            
        # finish printing the current indel
        if last=='insertion':
            pos=int(Segments[last_start].start)-int(feat_start)+1
            line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
            file_out2.write(line)
            var+=1
        elif last=='deletion':
            if start==1:
                pos=int(Segments[last_start].start)-int(feat_start)+1
                line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
                start=0
            else:
                pos=int(Segments[last_start].start)-int(feat_start)+1
                line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
            file_out2.write(line)
            var+=1

        # see if the end is missing for one of the two genomes
        # si i a la taille max, peut etre que j aussi est arrivé à la fin, si j+1 est trop grand, donc si j==len(list_segfeat_azu)-1
        # si i et j taille max (ou i et j+1), c bon les deux sont arrivés à la fin (je crois).
        # si i taille max et pas j, insertion. si j taille max et pas i, délétion. 


        if not((i>=len(list_segfeat_nb)-1) & (j>=len(list_segfeat_azu)-1)):
            if (i<len(list_segfeat_nb)-1):
                if not (j>=len(list_segfeat_azu)-1):
                    print('pb')
                # length of the deletion = position of the end of the feature - position of the start of the deletion
                length=feature.stop-Segments[list_segfeat_nb[i]].start
                line="deletion at the end of length "+str(length)+"\n"
                file_out2.write(line)
                var+=1
            elif (j<len(list_segfeat_azu)-1):
                file_out2.write("insertion at the end\n")
                var+=1


        # au début et à la fin, si on a une délétion, on peut que donner la taille du segment, pas la taille de la délétion. 
        # dans le segment ou la feature, garder l'info de où commence le gène sur le segment..... 
        # taille de la délétion = (début segment fin + position de fin) - début last_seg
        # ici last seg c list_nb[i], et segment fin c'est list_nb[-1]

        if var==0:
            line=feat+" : no variation\n"
            file_out2.write(line)
    else:
        line=feat+" : feature entirely absent.\n"
        file_out2.write(line)
    file_out2.write('\n')

    return diff_list


def remove_strand(list_seg):
    list_unstrand=list()
    for seg_stranded in list_seg:
        list_unstrand.append(seg_stranded[1:])
    return list_unstrand

def detect_inversion(list_1,list_2):
    list_1_unstrand=remove_strand(list_1)
    list_2_unstrand=remove_strand(list_2)

    # get the list of segments in common
    seg_common=list()
    for seg in list_1_unstrand:
        if seg in list_2_unstrand:
            seg_common.append(seg)
    
    # for each segment in common, check if the strand is the same. check index in list unstranded to get the segment in list stranded
    same_strand_count=0
    for seg in seg_common:
        index_1=list_1_unstrand.index(seg)
        index_2=list_2_unstrand.index(seg)
        if list_1[index_1]==list_2[index_2]:
            same_strand_count+=1

    # if there is less than 10% of strand difference among the common segments, return False, no inversion 
    if len(seg_common)-same_strand_count<len(seg_common)/10:
        return False
    else:
        return True