Skip to content
Snippets Groups Projects
Functions.py 37.2 KiB
Newer Older

# functions to create the segments and the features

NMarthe's avatar
NMarthe committed
# create a segment with its first feature
def init_seg(line,feature):
    line=line.split()
    seg_id=line[3][1:]
    chr=line[0]
    start=line[1]
    stop=line[2]
    
NMarthe's avatar
NMarthe committed
    # add the current feature to the list of features that are on the segment
    feature_stranded=feature_strand+feature
    feat=list()
    feat.append(feature_stranded)
    
NMarthe's avatar
NMarthe committed
    # create the segment
    Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)


NMarthe's avatar
NMarthe committed
# create a feature with the segment its on
    feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
    type=line[6]
    annot=line[12]
NMarthe's avatar
NMarthe committed
    # add the current segment to the list of segments that have the feature
    seg_stranded=strand+seg
    segments_list=list()
    segments_list.append(seg_stranded)

NMarthe's avatar
NMarthe committed
    # if the current feature has a parent, add the current feature in the childs list of its parent
    if annot.split(";")[1].split("=")[0]=="Parent":
NMarthe's avatar
NMarthe committed
    # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    
    elif annot.split(";")[2].split("=")[0]=="Parent":
NMarthe's avatar
NMarthe committed
    # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    else: parent=""

NMarthe's avatar
NMarthe committed
    # create the feature
    Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)


NMarthe's avatar
NMarthe committed
# add a feature to an existing segment
def add_feature(seg,new_feature,strand):
    new_feature_stranded=strand+new_feature
    if new_feature_stranded not in Segments[seg].features:
        Segments[seg].features.append(new_feature_stranded)
    
NMarthe's avatar
NMarthe committed

# add a child feature to an existing feature
NMarthe's avatar
NMarthe committed
    if feat in Features.keys(): # if the parent feature exists
        if new_child not in Features[feat].childs:
            Features[feat].childs.append(new_child)
    
    
NMarthe's avatar
NMarthe committed
# add a segment to an existing feature
def add_seg(feat,new_seg,strand):
    seg_stranded=strand+new_seg
    if seg_stranded not in Features[feat].segments_list:
        Features[feat].segments_list.append(seg_stranded)

NMarthe's avatar
NMarthe committed

# create a note for the child features that do not have annotation. 
NMarthe's avatar
NMarthe committed
    # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
NMarthe's avatar
NMarthe committed
    if feat.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
        feat.note=feat.annot.split(';')[-1]
NMarthe's avatar
NMarthe committed
    else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
        # we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
NMarthe's avatar
NMarthe committed
        annot_found=False
        while annot_found==False:
            if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
                note=Features[curent].annot.split(';')[-1]
                feat.note=note
                annot_found=True
            else: # if we didn't find the gene, we go back to the current feature's parent until we find it
                curent=Features[Features[curent].parent].id

NMarthe's avatar
NMarthe committed

# create all the Segment and Feature objects in the dictionnaries Segments and Features
def create_seg_feat(intersect_path):    

    global Features
    global Segments
    Segments={}
    Features={}
    
NMarthe's avatar
NMarthe committed
    # open the file with the intersect between the segments and the gff
    file = open(intersect_path, 'r')
    lines=file.readlines()
    
    for line in lines:
        # get the ids for the dictionnaries' keys
        feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
        segment_id=line.split()[3][1:]
    
        if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
            init_feature(line)
NMarthe's avatar
NMarthe committed
        else: # if it exists, add the current segment to the list of segments that have the existing feature
            add_seg(feature_id,segment_id,strand)
    
        if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
            init_seg(line, feature_id)
        
NMarthe's avatar
NMarthe committed
        else: # if it exists, add the current feature to the list of features on the existing segment
NMarthe's avatar
NMarthe committed
    # for all the features, add the note (information on the function of the feature)
    for feat_id in Features:
        set_note(feat_id)

# functions to generate the graph's gff from the segments and features created with create_seg_feat

# get the feature's start position on the segment
def get_start(seg,feat):
    s=Segments[seg]
    f=Features[feat]
    if s.start>f.start:
        result=1
    else:
        result=f.start-s.start
    return result

NMarthe's avatar
NMarthe committed

# get the feature's stop position on the segment
def get_stop(seg,feat):
    s=Segments[seg]
    f=Features[feat]    
    if s.stop<f.stop:
        result=s.size
    else:
        result=f.stop-s.start
    return result

NMarthe's avatar
NMarthe committed

# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
    print("generation of the graph's gff")
    file_out = open(file_out, 'w')

    for segment in Segments:
        # get the list of the features on the segment
        features_seg=Segments[segment].features
        
        # go through all these features, and print the gff line for each
        for feature_stranded in features_seg:
            strand=feature_stranded[0:1]
            feature=feature_stranded[1:]
            segment_stranded=strand+segment
            type=Features[feature].type
            start=get_start(segment,feature)
            stop=get_stop(segment,feature)
                
            # get the rank and the total number of ocurrences for the feature
            rank=str(Features[feature].segments_list.index(segment_stranded)+1)
            total=str(len(Features[feature].segments_list))

            # create the annotation with the rank information
            annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total

NMarthe's avatar
NMarthe committed
            # write the gff line in the output file
            line=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
            file_out.write(line)
# functions to generate a genome's gff from the graph's gff

# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
NMarthe's avatar
NMarthe committed
def get_start_2(seg_start, seg_genome, feat_id):
    s_start=seg_genome[seg_start][1]
    f_start=get_start(seg_start,feat_id)
    
    start=int(s_start)+int(f_start)
    return start

NMarthe's avatar
NMarthe committed

# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
NMarthe's avatar
NMarthe committed
def get_stop_2(seg_stop, seg_genome, feat_id):
    s_start=seg_genome[seg_stop][1]
NMarthe's avatar
NMarthe committed

# get the position of a part of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,feat_start,feature,seg_genome):
NMarthe's avatar
NMarthe committed
    # start position of the entire feature on the reference
    start_gene_start=Segments[feat_start].start+get_start(feat_start,feature.id)-1
    
    # start position of the piece of the feature on the reference
    start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
    
    # start and stop position of the feature on azucena, to get the length of the piece of the feature
NMarthe's avatar
NMarthe committed
    start_azu=get_start_2(seg_start,seg_genome,feature.id) 
    stop_azu=get_stop_2(seg_stop,seg_genome,feature.id)
    
    # start position of the piece of the feature on the complete feature
    start_gene=int(start_first_seg)-int(start_gene_start)+1
    # stop position : start+length
    stop_gene=start_gene+(stop_azu-start_azu)
    
    position=";position="+str(start_gene)+"-"+str(stop_gene)
    #position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
    return position

NMarthe's avatar
NMarthe committed

# get the proportion of a part of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_genome,feature):
    start_azu=get_start_2(seg_start,seg_genome,feature.id) # start position of the feature on azucena
    stop_azu=get_stop_2(seg_stop,seg_genome,feature.id) # stop position of the feature on azucena
    
    proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size)
    #proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size)
    return proportion

NMarthe's avatar
NMarthe committed

# returns the gff line to write in the output file
NMarthe's avatar
NMarthe committed
def write_line(list_seg,i,feat,seg_start,feat_start,seg_genome):
    seg_stop=list_seg[i-1][1:]
    feature=Features[feat]
NMarthe's avatar
NMarthe committed
    chr=seg_genome[seg_start][0]
    strand=list_seg[i-1][0:1]
    
    # start and stop position of the feature on azucena
NMarthe's avatar
NMarthe committed
    start_azu=get_start_2(seg_start,seg_genome,feature.id) 
    stop_azu=get_stop_2(seg_stop,seg_genome,feature.id)
NMarthe's avatar
NMarthe committed
    proportion=get_proportion(seg_start,seg_stop,seg_genome,feature)
    position=get_position(seg_start,seg_stop,feat_start,feature,seg_genome)
    
    annotation=feature.annot+proportion+position
    
    out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
    return out_line


# writes the gff of azucena using the gff of the graph
def genome_gff(pos_seg, gff, out, gfa):
    print("generation of the genome's gff ")
NMarthe's avatar
NMarthe committed
    # create a dictionnary with the positions of the segments on the genome to transfer on
    seg_genome={}
    bed=open(pos_seg,'r')
    lines=bed.readlines()
    for line in lines:
        line=line.split()
NMarthe's avatar
NMarthe committed
        seg_genome[s_id]=list([ch,start,stop])
    # create a dictionary with the paths from the gfa. each genome name gives the list of its segments in order
    file_gfa=open(gfa,'r')
    lines_gfa=file_gfa.readlines()
    paths={}
    for line in lines_gfa:
        line=line.split()
        if (line[0]=="W") & (line[1]!="_MINIGRAPH_"):
            
            path=line[6].replace(">",";>")
            path=path.replace("<",";<").split(';')
            list_path=list()
            for segment in path:
                if segment[0:1]=='>':
                    list_path.append('+s'+segment[1:])
                elif segment[0:1=='<']:
                    list_path.append('-s'+segment[1:])
                else:
                    list_path.append('s'+segment[1:])

                #couple=['s'+segment[1:],segment[0:1]]
                ##list_path.append(couple)
                #list_path.append('s'+segment[1:])
 
            paths[line[3]]=list_path


    file_out_detail = open("azucena_chr10_detail.gff", 'w')
    file_out=open(out,'w')
    file_out2 = open("segments_manquants.txt", 'w')

NMarthe's avatar
NMarthe committed
    # get the list of all the features to transfer from the gff
    list_feature=list()
    for line in lines:
        id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
        if id not in list_feature:
            list_feature.append(id)
    
NMarthe's avatar
NMarthe committed
    # create objects for stats on how many segments are absent in azucena, their average length, etc
    if stats==True:
        seg_first=list()
        seg_middle=list()
        seg_last=list()
        feature_missing_first=list()
        feature_missing_middle=list()
        feature_missing_last=list()
        feature_missing_all=list()
        feature_missing_total=list()
        feature_total=list()
        feature_ok=list()

        #size_seg_missing=list()

    
    # for each feature, get list of the segments where it is and the first and last segment of the feature on the genome
    for feat in list_feature:
        list_seg=Features[feat].segments_list
        size_list=len(list_seg)


        first_seg="0"
        for segment in list_seg:
NMarthe's avatar
NMarthe committed
            if segment[1:] in seg_genome:
                break
        last_seg="0"
        for segment in reversed(list_seg):
NMarthe's avatar
NMarthe committed
            if segment[1:] in seg_genome:
NMarthe's avatar
NMarthe committed
        segments_missing=list()
        # stats on how many segments are absent in azucena, their average length, etc
NMarthe's avatar
NMarthe committed

        # segment missing that is at the start of a feature - seg_first
        # segment missing that is at the end of a feature - seg_last
        # segment missing that is in the middle of a feature - seg_middle
        # segment missing that contains the entire feature - seg_entier
        # total number of genes missing - seg_total_abs
        # segments not missing - seg_ok
        # segments missing of not - seg_total
NMarthe's avatar
NMarthe committed
            for segment in list_seg: # counts the segments in each category
NMarthe's avatar
NMarthe committed
                if segment[1:] not in seg_genome: # the segment is missing
                    segments_missing.append(Segments[segment[1:]])
NMarthe's avatar
NMarthe committed
                    if segment==list_seg[0]: # the segment is the first of the feature
                        if segment==list_seg[-1]: # the segment is the last of the feature
                            seg_entier.append(seg_len) # the segment is the first and the last, so it contains the entire feature
NMarthe's avatar
NMarthe committed
                            seg_first.append(seg_len) 
                    elif segment==list_seg[-1]: # the segment is the last of the feature
                        seg_last.append(seg_len)
                    else:
                        seg_middle.append(seg_len)
NMarthe's avatar
NMarthe committed
                else: # the segment is present on the genome
NMarthe's avatar
NMarthe committed
        # the fist segment of the feature is missing - feature_missing_first
        # the last segment of the feature is missing - feature_missing_last
        # at least one middle segment of the feature is missing - feature_missing_middle
        # the entire feature is missing ->feature_missing_all
        # at least one segment is missing first, last, or middle) - feature_missing_total
        # no segment is missing, the feature is complete - feature_ok  
        # total number of features, with missing segments or not - feature_total

        if (stats==True):
            feature_total.append(feat)
NMarthe's avatar
NMarthe committed

            if (first_seg=='0') & (last_seg=='0') : # no segment of the feature is in the genome, the feature is missing entirely
                feature_missing_all.append(feat)
            elif first_seg[1:] != list_seg[0][1:]: # the first segment is missing 
                feature_missing_first.append(feat)
            elif last_seg[1:]!=list_seg[-1][1:]: # the last segment is missing
                feature_missing_last.append(feat)

NMarthe's avatar
NMarthe committed
            # go through all the segments, check if some are missing in the middle of the feature
            elif (len(list_seg)!=1) & (feat not in feature_missing_all): # to access the second to last element
                for segment in list_seg[1-(len(list_seg)-2)]:
NMarthe's avatar
NMarthe committed
                    if segment not in seg_genome:
                        feature_missing_middle.append(feat)
                        break


NMarthe's avatar
NMarthe committed
            # go through the segments, to see if one is missing anywhere on the feature
            for segment in list_seg:
NMarthe's avatar
NMarthe committed
               if segment[1:] not in seg_genome:
                   if feat not in feature_missing_total:
                        feature_missing_total.append(feat)
                        break

NMarthe's avatar
NMarthe committed
            # if the feature doesnt have a missing segment, it is complete.         ADD THE PATH CHECK !!
            if feat not in feature_missing_total:
                feature_ok.append(feat)


        # outputs each fragment of each feature, with its position on the new genome and its annotation : 
        # loop that goes through all the segments that have the current feature
NMarthe's avatar
NMarthe committed
        # keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the part of the fragment, and resets the first segment present.
        # continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature.
        # etc.
        # at the end of the loop, prints the last part of the fragment.
        
        feat_start="empty" # stocks the first segment of the feature
        seg_start="empty" # stocks the first segment of the current part of the feature
        #seg_stop="empty" # stocks the last segment of the current part of the feature
NMarthe's avatar
NMarthe committed
            if list_seg[i][1:] in seg_genome: # look for a segment present in the genome
                if feat_start=="empty":
                    feat_start=list_seg[i][1:]
NMarthe's avatar
NMarthe committed
                if seg_start=="empty": # if we dont have a start, take the current segment for the start of the part of the feature
                    seg_start=list_seg[i][1:]
                
                #else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
                
            else:
                if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
                    
NMarthe's avatar
NMarthe committed
                    out_line=write_line(list_seg,i,feat,seg_start,feat_start,seg_genome)
NMarthe's avatar
NMarthe committed
                    #seg_stop="empty"
            
                #else: if the current segment is not in azucena but there is no start, keep looking for a start
                
                
        # print the last piece of the feature in the end
NMarthe's avatar
NMarthe committed
        if list_seg[i][1:] in seg_genome:
            out_line=write_line(list_seg,i+1,feat,seg_start,feat_start,seg_genome)
NMarthe's avatar
NMarthe committed
        # outputs each feature once, from the first to the last segment present on the new genome and its annotation :      ADD THE SIZE PARAMETER
        if (first_seg!='0') & (last_seg!='0'):
            start2=get_start_2(first_seg[1:],seg_genome,feat)
            stop2=get_stop_2(last_seg[1:],seg_genome,feat)
            size_on_genome=int(stop2)-int(start2)+1
            size_diff=abs(size_on_genome-Features[feat].size)

            if not ((size_on_genome>Features[feat].size*2) | (size_on_genome<Features[feat].size/2)) :
                start2=get_start_2(first_seg[1:],seg_genome,feat)
                stop2=get_stop_2(last_seg[1:],seg_genome,feat)
                size_on_genome=int(stop2)-int(start2)+1
                size_diff=abs(size_on_genome-Features[feat].size)

                nb_variants=0
                for segment in list_seg:
                    if segment[1:] not in seg_genome:
                        nb_variants+=1

                annotation=Features[feat].annot+";Size_diff="+str(size_diff)+";Nb_variants="+str(nb_variants)

                line=chr+"  graph_gff   "+Features[feat].type+" "+str(get_start_2(first_seg[1:],seg_genome,feat))+" "+str(get_stop_2(last_seg[1:],seg_genome,feat))+"   .   "+strand+"   .   "+annotation+"\n"
                file_out.write(line)
            else:
                bad_size+=1
        else :
            absent_features+=1


        if (first_seg!='0') & (last_seg!='0'):
            # find the path in azucena. 
            #print(list.index(nom)
            id_first_seg=paths["CM020642.1_Azucena_chromosome10"].index(first_seg)
            id_last_seg=paths["CM020642.1_Azucena_chromosome10"].index(last_seg)
            list_segfeat_azu=paths["CM020642.1_Azucena_chromosome10"][1546:1634]
            list_segfeat_nb=Features[feat].segments_list

            debase=False
            if debase==True:
                # debut
                if list_segfeat_nb.index(first_seg)==0:
                    print("le debut c bon")
                else:
                    print("manque",list_segfeat_nb.index(first_seg)+1,"bout au début")

                # milieu
                for i in range(1,len(list_segfeat_nb)):
                    if list_segfeat_nb[i] not in list_segfeat_azu:
                        print("manque un bout au milieu",list_segfeat_nb[i])

                # fin
                if list_segfeat_nb.index(last_seg)==(len(list_segfeat_nb)):
                    print("la fin c bon")
                else:
                    print("manque",len(list_segfeat_nb)-list_segfeat_nb.index(last_seg),"bout à la fin")

            # boucles parcourir les deux listes.
            donei=False
            donej=False
            i=0
            j=0
            last=''
            last_taille=0
            while ((donei==False) & (donej==False)):
            #for i in range(0,len(list_segfeat_nb)):
                if list_segfeat_nb[i] != list_segfeat_azu[j]: # si on a une différence entre les deux listes : un est pas chez l'autre.
                    if list_segfeat_azu[j] not in list_segfeat_nb: # seg azu pas chez nb
                        if list_segfeat_nb[i] not in list_segfeat_azu: # seg nb pas dans azu
                            # si aucun des deux est dans l'autre, on a une substitution.

                            if last=='insertion':
                                print('insertion de taille',last_taille)
                            elif last=='deletion':
                                print('deletion de taille',last_taille)
                            last=''
                            last_taille=0

                            print('substitution de',list_segfeat_nb[i][1:],'de taille',Segments[list_segfeat_nb[i][1:]].size,"par",list_segfeat_azu[j][1:],'de taille',int(seg_genome[list_segfeat_azu[j][1:]][2])-int(seg_genome[list_segfeat_azu[j][1:]][1])+1)
                            if i<len(list_segfeat_nb)-1:
                                i+=1
                            else:
                                donei=True
                                #print('i1')
                            if j<len(list_segfeat_azu)-1:
                                j+=1
                            else:
                                donej=True
                                #print('j1')

                        else: # seg azu pas dans nb, mais seg nb dans azu. insertion
                            if last=='insertion':
                                last_taille+=int(seg_genome[list_segfeat_azu[j][1:]][2])-int(seg_genome[list_segfeat_azu[j][1:]][1])+1
                            elif last=='deletion':
                                print('deletion de taille',last_taille)
                                last=''
                                last_taille=0
                            else:
                                last='insertion'
                                last_taille=int(seg_genome[list_segfeat_azu[j][1:]][2])-int(seg_genome[list_segfeat_azu[j][1:]][1])+1
                            #print('insertion de',list_segfeat_azu[j][1:],'de taille',int(seg_genome[list_segfeat_azu[j][1:]][2])-int(seg_genome[list_segfeat_azu[j][1:]][1])+1)
                            if j<len(list_segfeat_azu)-1:
                                j+=1
                            else:
                                donej=True
                                #print('j2')
                        
                    elif list_segfeat_nb[i] not in list_segfeat_azu: # seg nb pas chez azu, mais seg azu dans nb. deletion

                        if last=='deletion':
                                last_taille+=Segments[list_segfeat_nb[i][1:]].size
                        elif last=='insertion':
                            print('insertion de taille',last_taille)
                            last=''
                            last_taille=0
                        else:
                            last='deletion'
                            last_taille=Segments[list_segfeat_nb[i][1:]].size
                        #print('deletion de',list_segfeat_nb[i][1:],'de taille',Segments[list_segfeat_nb[i][1:]].size)
                        if i<len(list_segfeat_nb)-1:
                                i+=1
                        else:
                            donei=True
                            #print('i2')

                    
                    else : 
                        print("changement d'ordre bizarre")
                        if i<len(list_segfeat_nb)-1:
                            i+=1
                        else:
                            donei=True
                            #print('i3')
                        if j<len(list_segfeat_azu)-1:
                            j+=1
                        else:
                            donej=True
                            #print('j3')
                else:
                    #print("segment",list_segfeat_nb[i][1:],'ok')
                    if last=='insertion':
                        print('insertion de taille',last_taille)
                    elif last=='deletion':
                        print('deletion de taille',last_taille)

                    last=''
                    last_taille=0
                    if i<len(list_segfeat_nb)-1:
                        i+=1
                    else:
                        donei=True
                        #print('i4')
                    if j<len(list_segfeat_azu)-1:
                        j+=1
                    else:
                        donej=True
                        #print('j4')
            
            # voir s'il manque la fin pour un des deux
            if last=='insertion':
                print('insertion de taille',last_taille)
            elif last=='deletion':
                print('deletion de taille',last_taille)

            if (i==len(list_segfeat_nb)-1) & (donei==True):
                if (j!=len(list_segfeat_azu)-1) & (donej==False): # taille à revoir
                    print('insertion de',list_segfeat_azu[j+1:len(list_segfeat_azu)-1][1:],'à la fin, de taille',int(seg_genome[list_segfeat_azu[j][1:]][2])-int(seg_genome[list_segfeat_azu[j][1:]][1])+1,'(le segment, pas l\'insertion)')
            else:
                print('deletion de',list_segfeat_nb[i:],'à la fin, de taille',Segments[list_segfeat_nb[i][1:]].size,'(le segment, pas la deletion)') # taille à revoir

            # au début et à la fin, si on a une délétion, on peut que donner la taille du segment, pas la taille de la délétion. 
            # j'ai aucune idée de comment régler ça simplement...
            # dans le segment ou la feature, garder l'info de où sur le segment ça démarre..... 
                

                                





        
        # outputs the detail of the missing fragments for each feature :

NMarthe's avatar
NMarthe committed
        # get the list of the segments in a feature that are missing in the genome
        for segment in list_seg:
NMarthe's avatar
NMarthe committed
                if segment[1:] not in seg_genome:
                    segments_missing.append(Segments[segment[1:]])

NMarthe's avatar
NMarthe committed
        # if there are missing segments, print them
        if (len(segments_missing)!=0) & (first_seg!='0') & (last_seg!='0'):
            
NMarthe's avatar
NMarthe committed
            # get the lengths of the feature, on the original genome or on the new one
            start2=get_start_2(first_seg[1:],seg_genome,feat)
            stop2=get_stop_2(last_seg[1:],seg_genome,feat)
            size_on_genome=int(stop2)-int(start2)+1
            size_diff=size_on_genome-Features[feat].size
            diff_list.append(abs(int(size_diff))) # to have stats on the size of the segments missing
NMarthe's avatar
NMarthe committed
            line=feat+" : "+str(len(segments_missing))+" variations. length difference (length in the new genome-length in the original genome) : "+str(size_diff)+"\n"
            # length of the original feature : "+str(Features[feat].size)+", length of the feature on this genome : "+str(size_on_genome)+"\n"
            file_out2.write(line)
NMarthe's avatar
NMarthe committed
            # for each segment missing, see where it is on the feature (start, middle, end)
            for segment in segments_missing:
                if (segment.id==Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#premier et pas dernier
NMarthe's avatar
NMarthe committed
                    line="first segment of the feature is missing.\n"
                elif (segment.id==Features[feat].segments_list[-1][1:]) & ((segment.id!=Features[feat].segments_list[0][1:])):#dernier et pas premier
NMarthe's avatar
NMarthe committed
                    line="last segment of the feature is missing.\n"
                elif (segment.id!=Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#ni premier ni dernier
NMarthe's avatar
NMarthe committed
                    line="variation in the middle of the feature, of length "+str(segment.size)+"\n"
                    # check if its a substitution of an indel
                    # if its a substitution ou a deletion, print what it is replacing in the original feature

                    #size_seg_missing.append(segment.size)
NMarthe's avatar
NMarthe committed
                file_out2.write(line)
NMarthe's avatar
NMarthe committed
        elif len(segments_missing)!=0:
            line=feat+": feature entirely absent\n\n"
            file_out2.write(line)
NMarthe's avatar
NMarthe committed
            line=feat+": no variation.\n\n"
            file_out2.write(line)
        


    #import matplotlib.pyplot as plt
    #plt.hist(diff_list, range=(-1000,1000))
    #plt.show()
    #print("segment au millieu taille moyenne : "+str(sum(size_seg_missing)/len(size_seg_missing)))
    

    if stats==True:
        print(len(Features)-(bad_size+absent_features),"out of",len(Features),"features are transfered.")
        print(bad_size,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
        print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
        print("average length difference of the transfered genes : ",sum(diff_list)/len(diff_list))
    
    # print stats
    from statistics import median, mean
    if stats==True:

        # prints the stats for the segments
        #print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
        #print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
        #print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
        #print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
        #print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
        #print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
NMarthe's avatar
NMarthe committed
        # creates files with the features by category (missing first, middle, end segment, etc)
        feat_miss_first=open('features_missing_first.txt','w')
        feat_miss_first.write("features missing the first segment on azucena :\n")
        for first in feature_missing_first :
            feat_miss_first.write(Features[first].annot)
            if Features[first].type!='gene':
                feat_miss_first.write(';')
                feat_miss_first.write(Features[first].note)
            feat_miss_first.write("\n")
        feat_miss_first.close()
        
        feat_miss_middle=open('features_missing_middle.txt','w')
        feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
        for middle in feature_missing_middle :
            feat_miss_middle.write(Features[middle].annot)
            if Features[middle].type!='gene':
                feat_miss_middle.write(';')
                feat_miss_middle.write(Features[middle].note)
            feat_miss_middle.write("\n")
        feat_miss_middle.close()
        
        feat_miss_last=open('features_missing_last.txt','w')
        feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
        for last in feature_missing_last :
            feat_miss_last.write(Features[last].annot)
            if Features[last].type!='gene':
                feat_miss_last.write(';')
                feat_miss_last.write(Features[last].note)
            feat_miss_last.write("\n")
        feat_miss_last.close()
    
        feat_miss=open('features_missing.txt','w')
        feat_miss.write("\nfeatures missing entirely on azucena :\n")
        for entier in feature_missing_all :
            feat_miss.write(Features[entier].annot)
            if Features[entier].type!='gene':
                feat_miss.write(';')
                feat_miss.write(Features[entier].note)
            feat_miss.write("\n")
        feat_miss.close()

        feat_miss_total=open('features_missing_total.txt','w')
        feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
        for total in feature_missing_total :
            feat_miss_total.write(Features[total].annot)
            if Features[total].type!='gene':
                feat_miss_total.write(';')
                feat_miss_total.write(Features[total].note)
            feat_miss_total.write("\n")
        feat_miss_total.close()
    
        feat_all=open('all_features.txt','w')
        for feat in list_feature:
            feat_all.write(Features[feat].annot)
            if Features[feat].type!='gene':
                feat_all.write(';')
                feat_all.write(Features[feat].note)
        feat_ok=open('features_ok.txt','w')
        feat_ok.write("\nfeatures entirely present in azucena :\n")
        for ok in feature_ok :
            feat_ok.write(Features[ok].annot)
            if Features[ok].type!='gene':
                feat_ok.write(';')
                feat_ok.write(Features[ok].note)
            feat_ok.write("\n")
        feat_ok.close()
        # prints the stats for the features (hypothetical/putative rate, by category)
        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
NMarthe's avatar
NMarthe committed
        total=len(feature_missing_first)
        print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
NMarthe's avatar
NMarthe committed
        total=len(feature_missing_middle)
        print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
NMarthe's avatar
NMarthe committed
        total=len(feature_missing_last)
        print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
NMarthe's avatar
NMarthe committed
        total=len(feature_missing_all)
        print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
NMarthe's avatar
NMarthe committed
        total=len(feature_missing_total)
        print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
NMarthe's avatar
NMarthe committed
        total=len(feature_ok)
        print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
NMarthe's avatar
NMarthe committed
        total=len(list_feature)
        print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

        command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
        subprocess.run(command,shell=True)