Functions.py

import ClassSegFeat as Class
import subprocess

# functions to create the segments and the features

# create segment
def init_seg(line,feature):
    line=line.split()
    seg_id=line[3][1:]
    chr=line[0]
    start=line[1]
    stop=line[2]
    
    # add the current feature (stranded) to the list of features that are on the segment
    feature_strand=line[10]
    feature_stranded=feature_strand+feature
    feat=list()
    feat.append(feature_stranded)
    
    Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)


# create feature
def init_feature(line):
    line=line.split()
    feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
    type=line[6]
    annot=line[12]
    chr=line[4]
    start=line[7]
    stop=line[8]
    childs=list()
    
    # add the current segment (stranded) to the list of segments that have the feature
    seg=line[3][1:]
    strand=line[10]
    seg_stranded=strand+seg
    segments_list=list()
    segments_list.append(seg_stranded)

    # if the current feature as a parent, add the feature un the childs list of its parent
    if annot.split(";")[1].split("=")[0]=="Parent":
    # for annotations like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    
    elif annot.split(";")[2].split("=")[0]=="Parent":
    # for annotations like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    else: parent=""

    Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)


# add a feature (stranded) to an existing segment
def add_feature(seg,new_feature,strand):
    new_feature_stranded=strand+new_feature
    if new_feature_stranded not in Segments[seg].features:
        Segments[seg].features.append(new_feature_stranded)
    
    
# add a child to an existing feature
def add_child(feat,new_child):
    if feat in Features.keys():
        if new_child not in Features[feat].childs:
            Features[feat].childs.append(new_child)
    
    
# add a segment (stranded) to an existing feature
def add_seg(feat,new_seg,strand):
    seg_stranded=strand+new_seg
    if seg_stranded not in Features[feat].segments_list:
        Features[feat].segments_list.append(seg_stranded)

    
def set_note(id):
    feat=Features[id]
    if feat.type=="gene":
        feat.note=feat.annot.split(';')[-1]
    else:
        curent=feat.parent
        ann=0
        while ann==0:
            if Features[curent].type=="gene":
                fonction=Features[curent].annot.split(';')[-1]
                feat.note=fonction
                ann=1
            else:
                curent=Features[Features[curent].parent].id

# create all the Segment and Feature objects in the dictionnaries Segments and Features
def create_seg_feat(intersect_path):    

    global Features
    global Segments
    Segments={}
    Features={}
    
    # open the intersect between the segments and the gff
    file = open(intersect_path, 'r')
    lines=file.readlines()
    
    for line in lines:
        # get the ids for the dictionnaries' keys
        feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
        segment_id=line.split()[3][1:]
    
        if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
            init_feature(line)
            #set_function(feature_id)
        else: # add the current segment to the list of segments that have the existing feature
            strand=line.split()[10]
            add_seg(feature_id,segment_id,strand)
    
        if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
            init_seg(line, feature_id)
        
        else: # add the current feature to the existing segment
            strand=line.split()[10]
            add_feature(segment_id,feature_id,strand)

    for feat_id in Features:
        set_note(feat_id)

    file.close()


# functions to generate the graph's gff from the segments and features created with create_seg_feat

# get the feature's start position on the segment
def get_start(seg,feat):
    s=Segments[seg]
    f=Features[feat]
    if s.start>f.start:
        result=1
    else:
        result=f.start-s.start
    return result

# get the feature's stop position on the segment
def get_stop(seg,feat):
    s=Segments[seg]
    f=Features[feat]    
    if s.stop<f.stop:
        result=s.size
    else:
        result=f.stop-s.start

    return result

# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
    print("generation of the graph's gff")
    file_out = open(file_out, 'w')

    for segment in Segments:
        # get the list of the features on the segment
        features_seg=Segments[segment].features
        
        # go through all these features, and print the gff line for each
        for feature_stranded in features_seg:
            strand=feature_stranded[0:1]
            feature=feature_stranded[1:]
            segment_stranded=strand+segment
            type=Features[feature].type
            #start=max(1,get_start(segment,feature))
            start=get_start(segment,feature)
            stop=get_stop(segment,feature)
                
            # get the rank and the total number of ocurrences for the feature
            rank=str(Features[feature].segments_list.index(segment_stranded)+1)
            total=str(len(Features[feature].segments_list))

            # create the annotation with the rank information
            annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total

            # generate the gff line
            ligne=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
            #print(ligne)
            file_out.write(ligne)

    file_out.close()


# functions to generate a genome's gff from the graph's gff

# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_start_2(seg_start, seg_ac, feat_id):
    s_start=seg_ac[seg_start][1]
    f_start=get_start(seg_start,feat_id)
    
    start=int(s_start)+int(f_start)
    return start

# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_stop_2(seg_stop, seg_ac, feat_id):
    s_start=seg_ac[seg_stop][1]
    f_stop=get_stop(seg_stop,feat_id)

    stop=int(s_start)+int(f_stop)
    return stop

# get the position of a piece of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,gene_start,feature,seg_ac):
    
    # start position of the complete gene on the reference
    start_gene_start=Segments[gene_start].start+get_start(gene_start,feature.id)-1
    
    # start position of the piece of the feature on the reference
    start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
    
    # start and stop position of the feature on azucena, to get the length of the piece of the feature
    start_azu=get_start_2(seg_start,seg_ac,feature.id) 
    stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
    
    # start position of the piece of the feature on the complete feature
    start_gene=int(start_first_seg)-int(start_gene_start)+1
    # stop position : start+length
    stop_gene=start_gene+(stop_azu-start_azu)
    
    position=";position="+str(start_gene)+"-"+str(stop_gene)
    #position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
    return position

# get the proportion of the piece of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_ac,feature):
    start_azu=get_start_2(seg_start,seg_ac,feature.id) # start position of the feature on azucena
    stop_azu=get_stop_2(seg_stop,seg_ac,feature.id) # stop position of the feature on azucena
    
    proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size)
    #proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size)
    return proportion

# returns the gff line to write in the output file
def write_line(list_seg,i,feat,seg_start,gene_start,seg_ac):
    seg_stop=list_seg[i-1][1:]
    feature=Features[feat]
    chr=seg_ac[seg_start][0]
    strand=list_seg[i-1][0:1]
    
    # start and stop position of the feature on azucena
    start_azu=get_start_2(seg_start,seg_ac,feature.id) 
    stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
    
    proportion=get_proportion(seg_start,seg_stop,seg_ac,feature)
    position=get_position(seg_start,seg_stop,gene_start,feature,seg_ac)
    
    annotation=feature.annot+proportion+position
    
    out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
    return out_line


# writes the gff of azucena using the gff of the graph
def genome_gff(pos_seg, gff, out):
    
    print("generation of the genome's gff ")
    
    # create a dictionnary with the positions of the segments on azucena
    seg_ac={}
    bed=open(pos_seg,'r')
    lines=bed.readlines()
    for line in lines:
        line=line.split()
        s_id=line[3][1:]
        ch=line[0]
        start=line[1]
        stop=line[2]
        seg_ac[s_id]=list([ch,start,stop])
    bed.close()
    
    gff=open(gff,'r')
    file_out = open(out, 'w')
    file_out_alt=open("azucena_chr10_alt.gff",'w')
    file_out2 = open("segments_manquants.txt", 'w')

    lines=gff.readlines()

    diff_list=list()

    stats=True
  
    # get the list of all the features to transfer
    list_feature=list()
    for line in lines:
        id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
        if id not in list_feature:
            list_feature.append(id)
    
    # stats on how many segments are absent in azucena, their average length, etc
    if stats==True:
        seg_first=list()
        seg_middle=list()
        seg_last=list()
        seg_total_abs=list()
        seg_ok=list()
        seg_entier=list()
        seg_total=list()

        feature_missing_first=list()
        feature_missing_middle=list()
        feature_missing_last=list()
        feature_missing_all=list()
        feature_missing_total=list()
        feature_total=list()
        feature_ok=list()

        #size_seg_missing=list()


    # for each feature, get list of the segments where it is and the first and last segment of the feature on the genome
    for feat in list_feature:
        list_seg=Features[feat].segments_list
        size_list=len(list_seg)

        segments_missing=list()

        first_seg="0"
        for segment in list_seg:
            if segment[1:] in seg_ac:
                first_seg=segment[1:]
                break
        last_seg="0"
        for segment in reversed(list_seg):
            if segment[1:] in seg_ac:
                last_seg=segment[1:]
                break
        

        # stats on how many segments are absent in azucena, their average length, etc

        # for the segments
        # manque au début-seg_first
        # manque à la fin-seg_last
        # manque au milieu-seg_middle
        # début milieu et fin, donc feature entier ?-seg_entier
        # nb total de segments dans un gène et pas chez azucena-seg_total_abs
        # présents chez azucena-seg_ok
        # segments total dans une feature (présent ou pas dans azucena)-seg_total

        if (stats==True):

            for segment in list_seg:
                seg_len=Segments[segment[1:]].size
                if segment[1:] not in seg_ac:
                    seg_total.append(seg_len)
                    segments_missing.append(Segments[segment[1:]])
                    #if feat not in feature_missing_total:
                     #   feature_missing_total.append(feat)
                    if segment==list_seg[0]:
                        if segment==list_seg[-1]:
                            seg_entier.append(seg_len) # contient les features en un seul segment, absent chez azu.
                      #      if feat not in feature_missing_all:
                       #         feature_missing_all.append(feat)
                        else: 
                            seg_first.append(seg_len)
                        #    if feat not in feature_missing_first:
                         #       feature_missing_first.append(feat) 
                    elif segment==list_seg[-1]:
                        seg_last.append(seg_len)
                       # if feat not in feature_missing_last:
                        #    feature_missing_last.append(feat) 
                    else:
                        seg_middle.append(seg_len)
                        #if feat not in feature_missing_middle:
                         #   feature_missing_middle.append(feat)
                else: 
                    seg_ok.append(seg_len)


        # for the features : 
        # manque le premier->feature_missing_first
        # manque le dernier->feature_missing_last
        # trou au milieu->feature_missing_middle
        # absent totalement (0) ->feature_missing_all
        # au moins un segment qui manque->feature_missing_total
        # features completes chez azucena->feature_ok  
        # nb features total->feature_total

        if (stats==True):
            feature_total.append(feat)
            # premier dernier ou aucun seg présent :
            if (first_seg=='0') & (last_seg=='0') : # aucun des segments de la feature n'est dans azucena.
                feature_missing_all.append(feat)
            elif first_seg != list_seg[0][1:]: # le premier segment est manquant chez azucena
                feature_missing_first.append(feat)
            elif last_seg!=list_seg[-1][1:]: # le dernier segment est manquant chez azucena
                feature_missing_last.append(feat)

            # parcourir les segments, voir s'ils manquent au milieu
            elif (len(list_seg)!=1) & (feat not in feature_missing_all): # pour pouvoir accéder à l'avant dernier élément
                for segment in list_seg[1-(len(list_seg)-2)]:
                    if segment not in seg_ac:
                        feature_missing_middle.append(feat)
                        break


            # parcourir les segments, voir s'il en manque un n'importe où
            for segment in list_seg:
               if segment[1:] not in seg_ac:
                   if feat not in feature_missing_total:
                        feature_missing_total.append(feat)
                        break

            if feat not in feature_missing_total:
                feature_ok.append(feat)


        # outputs each fragment of each feature, with its position on the new genome and its annotation : 
        # loop that goes through all the segments that have the current feature
        # keeps the first segment present in azucena found, and when it finds a segment absent in azucena, prints the piece of the fragment, and resets the first segment present.
        # at the end of the loop, prints the last piece of the fragment.
        gene_start="empty"
        seg_start="empty"
        seg_stop="empty"

        for i in range(0,size_list):
            if list_seg[i][1:] in seg_ac:
                if gene_start=="empty":
                    gene_start=list_seg[i][1:]
    
                if seg_start=="empty":
                    seg_start=list_seg[i][1:]
                
                #else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
                
            else:
                if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
                    
                    out_line=write_line(list_seg,i,feat,seg_start,gene_start,seg_ac)
                    file_out.write(out_line)
                    seg_start="empty"
                    seg_stop="empty"
            
                #else: if the current segment is not in azucena but there is no start, keep looking for a start
                
                
        # print the last piece of the feature in the end
        if list_seg[i][1:] in seg_ac:
            out_line=write_line(list_seg,i+1,feat,seg_start,gene_start,seg_ac)
            file_out.write(out_line)
        
        
        # outputs each feature once, from the first to the last segment present on the new genome and its annotation :
        if (first_seg!='0') & (last_seg!='0'):
            chr=seg_ac[first_seg][0]
            strand=list_seg[i-1][0:1]
            ligne=chr+"  graph_gff   "+Features[feat].type+" "+str(get_start_2(first_seg,seg_ac,feat))+" "+str(get_stop_2(last_seg,seg_ac,feat))+"   .   "+strand+"   .   "+Features[feat].annot+"\n"
            file_out_alt.write(ligne)

        
        # outputs the detail of the missing fragments for each feature :

        for segment in list_seg:
                if segment[1:] not in seg_ac:
                    segments_missing.append(Segments[segment[1:]])

        if (len(segments_missing)!=0) & (first_seg!='0') & (last_seg!='0'):
            
            #print(first_seg,last_seg)
            start2=get_start_2(first_seg,seg_ac,feat)
            stop2=get_stop_2(last_seg,seg_ac,feat)
            size_on_genome=int(stop2)-int(start2)+1
            size_diff=size_on_genome-Features[feat].size
            diff_list.append(int(size_diff))

            ligne=feat+" : "+str(len(segments_missing))+" variations. différence de taille (taille dans le nouveau génome-taille dans nipponbare) : "+str(size_diff)+"\n"
            #taille du gene à l'origine : "+str(Features[feat].size)+", taille du gene sur ce génome : "+str(size_on_genome)+"\n"
            file_out2.write(ligne)

            for segment in segments_missing:
                if (segment.id==Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#premier et pas dernier
                    ligne="premier segment du gène manquant.\n"
                elif (segment.id==Features[feat].segments_list[-1][1:]) & ((segment.id!=Features[feat].segments_list[0][1:])):#dernier et pas premier
                    ligne="dernier segment du gène manquant.\n"
                elif (segment.id!=Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#ni premier ni dernier
                    ligne="variation au milieu du gène, de taille "+str(segment.size)+"\n"
                    # voir si c'est substitution ou indel
                    # print ce qu'il y a en face dans nb

                    #size_seg_missing.append(segment.size)


                file_out2.write(ligne)
            file_out2.write("\n")
        else:
            ligne=feat+": aucune variation.\n\n"
            file_out2.write(ligne)
        

    #print("difference moyenne : "+str(sum(diff_list)/len(diff_list)))
    #import matplotlib.pyplot as plt
    #plt.hist(diff_list, range=(-1000,1000))
    #plt.show()
    #print("segment au millieu taille moyenne : "+str(sum(size_seg_missing)/len(size_seg_missing)))
    

    file_out.close()
    gff.close() 
    file_out2.close()
    file_out_alt.close()

    
    # print stats
    from statistics import median, mean
    if stats==True:

        # prints the stats for the segments
        #print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
        #print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
        #print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
        #print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
        #print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
        #print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
    
        # creates files with the features by category
        feat_miss_first=open('features_missing_first.txt','w')
        feat_miss_first.write("features missing the first segment on azucena :\n")
        for first in feature_missing_first :
            feat_miss_first.write(Features[first].annot)
            if Features[first].type!='gene':
                feat_miss_first.write(';')
                feat_miss_first.write(Features[first].note)
            feat_miss_first.write("\n")
        feat_miss_first.close()
        
        feat_miss_middle=open('features_missing_middle.txt','w')
        feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
        for middle in feature_missing_middle :
            feat_miss_middle.write(Features[middle].annot)
            if Features[middle].type!='gene':
                feat_miss_middle.write(';')
                feat_miss_middle.write(Features[middle].note)
            feat_miss_middle.write("\n")
        feat_miss_middle.close()
        
        feat_miss_last=open('features_missing_last.txt','w')
        feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
        for last in feature_missing_last :
            feat_miss_last.write(Features[last].annot)
            if Features[last].type!='gene':
                feat_miss_last.write(';')
                feat_miss_last.write(Features[last].note)
            feat_miss_last.write("\n")
        feat_miss_last.close()
    
        feat_miss=open('features_missing.txt','w')
        feat_miss.write("\nfeatures missing entirely on azucena :\n")
        for entier in feature_missing_all :
            feat_miss.write(Features[entier].annot)
            if Features[entier].type!='gene':
                feat_miss.write(';')
                feat_miss.write(Features[entier].note)
            feat_miss.write("\n")
        feat_miss.close()

        feat_miss_total=open('features_missing_total.txt','w')
        feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
        for total in feature_missing_total :
            feat_miss_total.write(Features[total].annot)
            if Features[total].type!='gene':
                feat_miss_total.write(';')
                feat_miss_total.write(Features[total].note)
            feat_miss_total.write("\n")
        feat_miss_total.close()
    
        feat_all=open('all_features.txt','w')
        for feat in list_feature:
            feat_all.write(Features[feat].annot)
            if Features[feat].type!='gene':
                feat_all.write(';')
                feat_all.write(Features[feat].note)
            feat_all.write("\n")
        feat_all.close()

        feat_ok=open('features_ok.txt','w')
        feat_ok.write("\nfeatures entirely present in azucena :\n")
        for ok in feature_ok :
            feat_ok.write(Features[ok].annot)
            if Features[ok].type!='gene':
                feat_ok.write(';')
                feat_ok.write(Features[ok].note)
            feat_ok.write("\n")
        feat_ok.close()

        # prints the stats for the features (hypothetical/putative rate, by category)
        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
        total=int(subprocess.getoutput("wc -l < features_missing_first.txt"))
        print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/len(feature_missing_first),2),"% hypothetical or putative.")

        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
        total=int(subprocess.getoutput("wc -l < features_missing_middle.txt"))
        print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/len(feature_missing_middle),2),"% hypothetical or putative.")

        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
        total=int(subprocess.getoutput("wc -l < features_missing_last.txt"))
        print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/len(feature_missing_last),2),"% hypothetical or putative.")

        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
        total=int(subprocess.getoutput("wc -l < features_missing.txt"))
        print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/len(feature_missing_all),2),"% hypothetical or putative.")

        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
        total=int(subprocess.getoutput("wc -l < features_missing_total.txt"))
        print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/len(feature_missing_total),2),"% hypothetical or putative.")

        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
        total=int(subprocess.getoutput("wc -l < features_ok.txt"))
        print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/len(feature_ok),2),"% hypothetical or putative.")

        hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
        total=int(subprocess.getoutput("wc -l < all_features.txt"))
        print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/len(list_feature),2),"% hypothetical or putative.")

        command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
        subprocess.run(command,shell=True)