Skip to content
Snippets Groups Projects
Functions.py 41.1 KiB
Newer Older

# functions to create the segments and the features

NMarthe's avatar
NMarthe committed
# create a segment with its first feature
def init_seg(line,feature):
    line=line.split()
    seg_id=line[3][1:]
    chr=line[0]
    start=line[1]
    stop=line[2]
    
NMarthe's avatar
NMarthe committed
    # add the current feature to the list of features that are on the segment
    feature_stranded=feature_strand+feature
    feat=list()
    feat.append(feature_stranded)
    
NMarthe's avatar
NMarthe committed
    # create the segment
    Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)


NMarthe's avatar
NMarthe committed
# create a feature with the segment its on
    feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
    type=line[6]
    annot=line[12]
NMarthe's avatar
NMarthe committed
    # add the current segment to the list of segments that have the feature
    seg_stranded=strand+seg
    segments_list=list()
    segments_list.append(seg_stranded)

NMarthe's avatar
NMarthe committed
    # if the current feature has a parent, add the current feature in the childs list of its parent
    if annot.split(";")[1].split("=")[0]=="Parent":
NMarthe's avatar
NMarthe committed
    # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    
    elif annot.split(";")[2].split("=")[0]=="Parent":
NMarthe's avatar
NMarthe committed
    # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
        add_child(parent,feature_id)
    else: parent=""

NMarthe's avatar
NMarthe committed
    # create the feature
    Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)


NMarthe's avatar
NMarthe committed
# add a feature to an existing segment
def add_feature(seg,new_feature,strand):
    new_feature_stranded=strand+new_feature
    if new_feature_stranded not in Segments[seg].features:
        Segments[seg].features.append(new_feature_stranded)
    
NMarthe's avatar
NMarthe committed

# add a child feature to an existing feature
NMarthe's avatar
NMarthe committed
    if feat in Features.keys(): # if the parent feature exists
        if new_child not in Features[feat].childs:
            Features[feat].childs.append(new_child)
    
    
NMarthe's avatar
NMarthe committed
# add a segment to an existing feature
def add_seg(feat,new_seg,strand):
    seg_stranded=strand+new_seg
    if seg_stranded not in Features[feat].segments_list:
        Features[feat].segments_list.append(seg_stranded)

NMarthe's avatar
NMarthe committed

# create a note for the child features that do not have annotation. 
NMarthe's avatar
NMarthe committed
    # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
NMarthe's avatar
NMarthe committed
    if feat.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
        feat.note=feat.annot.split(';')[-1]
NMarthe's avatar
NMarthe committed
    else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
        # we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
NMarthe's avatar
NMarthe committed
        annot_found=False
        while annot_found==False:
            if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
                note=Features[curent].annot.split(';')[-1]
                feat.note=note
                annot_found=True
            else: # if we didn't find the gene, we go back to the current feature's parent until we find it
                curent=Features[Features[curent].parent].id

NMarthe's avatar
NMarthe committed

# create all the Segment and Feature objects in the dictionnaries Segments and Features
def create_seg_feat(intersect_path):    

    global Features
    global Segments
    Segments={}
    Features={}
    
NMarthe's avatar
NMarthe committed
    # open the file with the intersect between the segments and the gff
    file = open(intersect_path, 'r')
    lines=file.readlines()
    
    for line in lines:
        # get the ids for the dictionnaries' keys
        feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
        segment_id=line.split()[3][1:]
    
        if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
            init_feature(line)
NMarthe's avatar
NMarthe committed
        else: # if it exists, add the current segment to the list of segments that have the existing feature
            add_seg(feature_id,segment_id,strand)
    
        if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
            init_seg(line, feature_id)
        
NMarthe's avatar
NMarthe committed
        else: # if it exists, add the current feature to the list of features on the existing segment
    # for all the features, add the note (information on the function of the feature), and the positions on the first and last seg.
    for feat_id in Features:
        feat=Features[feat_id]
        set_note(feat.id)
        feat.pos_start=get_start(feat.segments_list[0][1:],feat.id)
        feat.pos_stop=get_stop(feat.segments_list[-1][1:],feat.id)
# functions to generate the graph's gff from the segments and features created with create_seg_feat

# get the feature's start position on the segment
def get_start(seg,feat):
    s=Segments[seg]
    f=Features[feat]
    if s.start>f.start:
        result=1
    else:
        result=f.start-s.start
    return result

NMarthe's avatar
NMarthe committed

# get the feature's stop position on the segment
def get_stop(seg,feat):
    s=Segments[seg]
    f=Features[feat]    
    if s.stop<f.stop:
        result=s.size
    else:
        result=f.stop-s.start
    return result

NMarthe's avatar
NMarthe committed

# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
    print("generation of the graph's gff")
    file_out = open(file_out, 'w')

    for segment in Segments:
        # get the list of the features on the segment
        features_seg=Segments[segment].features
        
        # go through all these features, and print the gff line for each
        for feature_stranded in features_seg:
            strand=feature_stranded[0:1]
            feature=feature_stranded[1:]
            segment_stranded=strand+segment
            type=Features[feature].type
            start=get_start(segment,feature)
            stop=get_stop(segment,feature)
                
            # get the rank and the total number of ocurrences for the feature
            rank=str(Features[feature].segments_list.index(segment_stranded)+1)
            total=str(len(Features[feature].segments_list))

            # create the annotation with the rank information
            annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total

NMarthe's avatar
NMarthe committed
            # write the gff line in the output file
            line=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
            file_out.write(line)
# functions to generate a genome's gff from the graph's gff

# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
NMarthe's avatar
NMarthe committed
def get_start_2(seg_start, seg_genome, feat_id):
NMarthe's avatar
NMarthe committed
    s_start=seg_genome[seg_start][1]
    f_start=get_start(seg_start,feat_id)
    
    start=int(s_start)+int(f_start)
    return start

NMarthe's avatar
NMarthe committed

# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
NMarthe's avatar
NMarthe committed
def get_stop_2(seg_stop, seg_genome, feat_id):
    s_start=seg_genome[seg_stop][1]
NMarthe's avatar
NMarthe committed

# get the position of a part of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,feat_start,feature,seg_genome):
NMarthe's avatar
NMarthe committed
    # start position of the entire feature on the reference
    start_gene_start=Segments[feat_start[1:]].start+get_start(feat_start[1:],feature.id)-1
    
    # start position of the piece of the feature on the reference
    start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
    
    # start and stop position of the feature on azucena, to get the length of the piece of the feature
NMarthe's avatar
NMarthe committed
    start_azu=get_start_2(seg_start,seg_genome,feature.id) 
    stop_azu=get_stop_2(seg_stop,seg_genome,feature.id)
    
    # start position of the piece of the feature on the complete feature
    start_gene=int(start_first_seg)-int(start_gene_start)+1
    # stop position : start+length
    stop_gene=start_gene+(stop_azu-start_azu)
    
    position=";position="+str(start_gene)+"-"+str(stop_gene)
    #position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
    return position

NMarthe's avatar
NMarthe committed

# get the proportion of a part of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_genome,feature):
    start_azu=get_start_2(seg_start,seg_genome,feature.id) # start position of the feature on azucena
    stop_azu=get_stop_2(seg_stop,seg_genome,feature.id) # stop position of the feature on azucena
    
    proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size)
    #proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size)
    return proportion

NMarthe's avatar
NMarthe committed

# returns the gff line to write in the output file
NMarthe's avatar
NMarthe committed
def write_line(list_seg,i,feat,seg_start,feat_start,seg_genome):
    seg_stop=list_seg[i-1]
    chr=seg_genome[seg_start[1:]][0]
    strand=list_seg[i-1][0:1]
    
    # start and stop position of the feature on azucena
    start_azu=get_start_2(seg_start[1:],seg_genome,feature.id) 
    stop_azu=get_stop_2(seg_stop[1:],seg_genome,feature.id)
    proportion=get_proportion(seg_start[1:],seg_stop[1:],seg_genome,feature)
    position=get_position(seg_start[1:],seg_stop[1:],feat_start,feature,seg_genome)
    
    annotation=feature.annot+proportion+position
    
    out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
    return out_line



def stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok):

    # creates files with the features by category (missing first, middle, end segment, etc)
    feat_miss_first=open('features_missing_first.txt','w')
    feat_miss_first.write("features missing the first segment on azucena :\n")
    for first in feature_missing_first :
        feat_miss_first.write(Features[first].annot)
        if Features[first].type!='gene':
            feat_miss_first.write(';')
            feat_miss_first.write(Features[first].note)
        feat_miss_first.write("\n")
    feat_miss_first.close()
        
    feat_miss_middle=open('features_missing_middle.txt','w')
    feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
    for middle in feature_missing_middle :
        feat_miss_middle.write(Features[middle].annot)
        if Features[middle].type!='gene':
            feat_miss_middle.write(';')
            feat_miss_middle.write(Features[middle].note)
        feat_miss_middle.write("\n")
    feat_miss_middle.close()
       
    feat_miss_last=open('features_missing_last.txt','w')
    feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
    for last in feature_missing_last :
        feat_miss_last.write(Features[last].annot)
        if Features[last].type!='gene':
            feat_miss_last.write(';')
            feat_miss_last.write(Features[last].note)
        feat_miss_last.write("\n")
    feat_miss_last.close()
    
    feat_miss=open('features_missing.txt','w')
    feat_miss.write("\nfeatures missing entirely on azucena :\n")
    for entier in feature_missing_all :
        feat_miss.write(Features[entier].annot)
        if Features[entier].type!='gene':
            feat_miss.write(';')
            feat_miss.write(Features[entier].note)
        feat_miss.write("\n")
    feat_miss.close()

    feat_miss_total=open('features_missing_total.txt','w')
    feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
    for total in feature_missing_total :
        feat_miss_total.write(Features[total].annot)
        if Features[total].type!='gene':
            feat_miss_total.write(';')
            feat_miss_total.write(Features[total].note)
        feat_miss_total.write("\n")
    feat_miss_total.close()
    
    feat_all=open('all_features.txt','w')
    for feat in list_feature:
        feat_all.write(Features[feat].annot)
        if Features[feat].type!='gene':
            feat_all.write(';')
            feat_all.write(Features[feat].note)
        feat_all.write("\n")
    feat_all.close()

    feat_ok=open('features_ok.txt','w')
    feat_ok.write("\nfeatures entirely present in azucena :\n")
    for ok in feature_ok :
        feat_ok.write(Features[ok].annot)
        if Features[ok].type!='gene':
            feat_ok.write(';')
            feat_ok.write(Features[ok].note)
        feat_ok.write("\n")
    feat_ok.close()

    # prints the stats for the features (hypothetical/putative rate, by category)
    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
    total=len(feature_missing_first)
    print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
    total=len(feature_missing_middle)
    print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
    total=len(feature_missing_last)
    print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
    total=len(feature_missing_all)
    print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
    total=len(feature_missing_total)
    print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
    total=len(feature_ok)
    print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
    total=len(list_feature)
    print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")

    command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
    subprocess.run(command,shell=True)


def gff_detail(list_seg,seg_genome,feat,file_out_detail):
    # outputs each fragment of each feature, with its position on the new genome and its annotation : 
    # loop that goes through all the segments that have the current feature
    # keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the part of the fragment, and resets the first segment present.
    # continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature.
    # etc.
    # at the end of the loop, prints the last part of the fragment.
    size_list=len(list_seg)

    feat_start="empty" # stocks the first segment of the feature
    seg_start="empty" # stocks the first segment of the current part of the feature
    #seg_stop="empty" # stocks the last segment of the current part of the feature

    for i in range(0,size_list):
        if list_seg[i][1:] in seg_genome: # look for a segment present in the genome
            if feat_start=="empty":
                feat_start=list_seg[i]
    
            if seg_start=="empty": # if we dont have a start, take the current segment for the start of the part of the feature
                seg_start=list_seg[i]
                
            #else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
                
        else:
            if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
                    
                out_line=write_line(list_seg,i,feat,seg_start,feat_start,seg_genome)
                file_out_detail.write(out_line)
                seg_start="empty"
                #seg_stop="empty"
            
            #else: if the current segment is not in azucena but there is no start, keep looking for a start
                
                
    # print the last piece of the feature in the end
    if list_seg[i][1:] in seg_genome:
        out_line=write_line(list_seg,i+1,feat,seg_start,feat_start,seg_genome)
        file_out_detail.write(out_line)


def gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n):
    # outputs each feature once, from the first to the last segment present on the new genome and its annotation (if the size is ok) :
    bad_size=0
    absent_features=0
    if (first_seg!='0') & (last_seg!='0'):

        start2=get_start_2(first_seg,seg_genome,feat)
        stop2=get_stop_2(last_seg,seg_genome,feat)
        size_on_genome=int(stop2)-int(start2)+1
        size_diff=abs(size_on_genome-Features[feat].size)

        if not ((size_on_genome>Features[feat].size*n) | (size_on_genome<Features[feat].size/n)) : # si le nouveau gene est pas 2x plus grand ou plus petit que l'original
            chr=seg_genome[first_seg][0]
            strand=list_seg[0][0:1]

            start2=get_start_2(first_seg,seg_genome,feat)
            stop2=get_stop_2(last_seg,seg_genome,feat)
            size_on_genome=int(stop2)-int(start2)+1
            size_diff=abs(size_on_genome-Features[feat].size)

            nb_variants=0
            for segment in list_seg:
                if segment[1:] not in seg_genome:
                    nb_variants+=1

            annotation=Features[feat].annot+";Size_diff="+str(size_diff)+";Nb_variants="+str(nb_variants)

            line=chr+"  graph_gff   "+Features[feat].type+" "+str(get_start_2(first_seg,seg_genome,feat))+" "+str(get_stop_2(last_seg,seg_genome,feat))+"   .   "+strand+"   .   "+annotation+"\n"
            file_out.write(line)
        else:
            bad_size+=1
    else :
        absent_features+=1
    
    return list([bad_size,absent_features])



# writes the gff of azucena using the gff of the graph
def genome_gff(pos_seg, gff, out, gfa):
    print("generation of the genome's gff ")
NMarthe's avatar
NMarthe committed
    # create a dictionnary with the positions of the segments on the genome to transfer on
    seg_genome={}
    bed=open(pos_seg,'r')
    lines=bed.readlines()
    for line in lines:
        line=line.split()
        if line[3][0:1]=='>':
            strand='+'
        elif line[3][0:1]=='<':
            strand='-'
        else:
            strand=''
        seg=line[3][1:]
        seg_genome[seg]=list([ch,start,stop,strand])
    # create a dictionnary with the sequences of the segments of the graph and a dictionary with the paths from the gfa. each genome name gives the list of its segments in order
    file_gfa=open(gfa,'r')
    lines_gfa=file_gfa.readlines()
    paths={}
    for line in lines_gfa:
        line=line.split()
        if (line[0]=="S"):
            seg_id='s'+line[1]
            seg_seq[seg_id]=line[2]

        if (line[0]=="W") & (line[1]!="_MINIGRAPH_"):
            
            path=line[6].replace(">",";>")
            path=path.replace("<",";<").split(';')
            list_path=list()
            for segment in path:
                if segment[0:1]=='>':
                    list_path.append('+s'+segment[1:])
                elif segment[0:1]=='<':
                    list_path.append('-s'+segment[1:])
                else:
                    list_path.append('s'+segment[1:])

                #couple=['s'+segment[1:],segment[0:1]]
                ##list_path.append(couple)
                #list_path.append('s'+segment[1:])
 
            paths[line[3]]=list_path


    file_out_detail = open("azucena_chr10_detail.gff", 'w')
    file_out=open(out,'w')
    file_out2 = open("segments_manquants.txt", 'w')

    diff_list=list()
NMarthe's avatar
NMarthe committed
    # get the list of all the features to transfer from the gff
    lines=gff.readlines()
    for line in lines:
        id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
        if id not in list_feature:
            list_feature.append(id)
NMarthe's avatar
NMarthe committed
    # create objects for stats on how many segments are absent in azucena, their average length, etc
    if stats==True:
        seg_first=list()
        seg_middle=list()
        seg_last=list()
        feature_missing_first=list()
        feature_missing_middle=list()
        feature_missing_last=list()
        feature_missing_all=list()
        feature_missing_total=list()
        feature_total=list()
        feature_ok=list()


    # for each feature, get list of the segments where it is and the first and last segment of the feature on the genome
    for feat in list_feature:
        list_seg=Features[feat].segments_list

        first_seg="0"
        for segment in list_seg:
NMarthe's avatar
NMarthe committed
            if segment[1:] in seg_genome:
                first_seg=segment[1:]
                break
        last_seg="0"
        for segment in reversed(list_seg):
NMarthe's avatar
NMarthe committed
            if segment[1:] in seg_genome:
                last_seg=segment[1:]
        # stats on how many segments are absent in azucena, their average length, etc
NMarthe's avatar
NMarthe committed

        # segment missing that is at the start of a feature - seg_first
        # segment missing that is at the end of a feature - seg_last
        # segment missing that is in the middle of a feature - seg_middle
        # segment missing that contains the entire feature - seg_entier
        # total number of genes missing - seg_total_abs
        # segments not missing - seg_ok
        # segments missing of not - seg_total
NMarthe's avatar
NMarthe committed
            for segment in list_seg: # counts the segments in each category
NMarthe's avatar
NMarthe committed
                if segment[1:] not in seg_genome: # the segment is missing
NMarthe's avatar
NMarthe committed
                    if segment==list_seg[0]: # the segment is the first of the feature
                        if segment==list_seg[-1]: # the segment is the last of the feature
                            seg_entier.append(seg_len) # the segment is the first and the last, so it contains the entire feature
NMarthe's avatar
NMarthe committed
                            seg_first.append(seg_len) 
                    elif segment==list_seg[-1]: # the segment is the last of the feature
                        seg_last.append(seg_len)
                    else:
                        seg_middle.append(seg_len)
NMarthe's avatar
NMarthe committed
                else: # the segment is present on the genome
NMarthe's avatar
NMarthe committed
        # the fist segment of the feature is missing - feature_missing_first
        # the last segment of the feature is missing - feature_missing_last
        # at least one middle segment of the feature is missing - feature_missing_middle
        # the entire feature is missing ->feature_missing_all
        # at least one segment is missing first, last, or middle) - feature_missing_total
        # no segment is missing, the feature is complete - feature_ok  
        # total number of features, with missing segments or not - feature_total

        if (stats==True):
            feature_total.append(feat)
NMarthe's avatar
NMarthe committed

            if (first_seg=='0') & (last_seg=='0') : # no segment of the feature is in the genome, the feature is missing entirely
                feature_missing_all.append(feat)
            elif first_seg != list_seg[0][1:]: # the first segment is missing 
                feature_missing_first.append(feat)
            elif last_seg!=list_seg[-1][1:]: # the last segment is missing
                feature_missing_last.append(feat)

NMarthe's avatar
NMarthe committed
            # go through all the segments, check if some are missing in the middle of the feature
            elif (len(list_seg)!=1) & (feat not in feature_missing_all): # to access the second to last element
                for segment in list_seg[1-(len(list_seg)-2)]:
                    if segment[1:] not in seg_genome:
                        feature_missing_middle.append(feat)
                        break


NMarthe's avatar
NMarthe committed
            # go through the segments, to see if one is missing anywhere on the feature
            for segment in list_seg:
NMarthe's avatar
NMarthe committed
               if segment[1:] not in seg_genome:
                   if feat not in feature_missing_total:
                        feature_missing_total.append(feat)
                        break

NMarthe's avatar
NMarthe committed
            # if the feature doesnt have a missing segment, it is complete.         ADD THE PATH CHECK !!
            if feat not in feature_missing_total:
                feature_ok.append(feat)



        # outputs each fragment of each feature, with its position on the new genome and its annotation : 
        gff_detail(list_seg,seg_genome,feat,file_out_detail)
        # outputs each feature once, from the first to the last segment present on the new genome and its annotation (if the size is ok) :
        n=2
        result=gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n)
        bad_size+=result[0]
        absent_features+=result[1]
        # outputs the detail of the missing fragments for each feature :
        dif_list=diff_list+gff_missing(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq)
        diff_list=dif_list
        #print(feat,"ok")
        #gff_missing(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq)
    
    file_out_detail.close()
    file_out.close()
    file_out2.close()
    
    # print stats
    from statistics import median, mean
    if stats==True:
        print(len(Features)-(bad_size+absent_features),"out of",len(Features),"features are transfered.")
        print(bad_size,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
        print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
        print("average length difference of the transfered genes : ",sum(diff_list)/len(diff_list))
        stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok)
        # prints the stats for the segments
        #print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
        #print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
        #print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
        #print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
        #print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
        #print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
    
def gff_missing(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq):
    # outputs the detail of the missing fragments for each feature :
    diff_list=list()
    # get the list of the segments in a feature that are missing in the genome
    segments_missing=list()
    for segment in list_seg:
            if segment[1:] not in seg_genome:
                segments_missing.append(Segments[segment[1:]])

    # if there are missing segments, print them
    if (first_seg!='0') & (last_seg!='0'):
        var=0 # count variations, to see if there is any

        feature=Features[feat]
        feat_start=feature.start

        # get the lengths of the feature, on the original genome and on the new one
        start2=get_start_2(first_seg,seg_genome,feat)
        stop2=get_stop_2(last_seg,seg_genome,feat)
        size_on_genome=int(stop2)-int(start2)+1
        size_diff=size_on_genome-feature.size
        diff_list.append(abs(int(size_diff))) # to have stats on the size of the segments missing
        line=feat+" : \nlength on the original genome = "+str(feature.size)+"\nlength difference (length in the new genome-length in the original genome) = "+str(size_diff)+"\n"
        file_out2.write(line)
        
        # find the path in azucena. 
        first_strand=seg_genome[first_seg][3]
        first_seg_stranded=first_strand+first_seg
        last_strand=seg_genome[last_seg][3]
        last_seg_stranded=last_strand+last_seg
        id_first_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(first_seg_stranded))
        id_last_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(last_seg_stranded))
        list_segfeat_azu=paths["CM020642.1_Azucena_chromosome10"][id_first_seg:id_last_seg+1]
        list_segfeat_nb=Features[feat].segments_list

        # get list segments present in azucena.
        # for segment in list_seg:

        # boucles parcourir les deux listes.
        i=0
        j=0
        last=''
        last_taille=0
        last_seq=''
        last_start=''
        start=0
        pos=''

        # check if the gene is inverted :
        # if all the segments in common (no strand) have the oposite strand, change the strand (or ignore it ?).
        # the list from the paths are stranded. 
        # first get the list of the segments in common without the strand
        # then compare the strands
        # if all the strands are different, print 'there is an inversion'.
        # else print nothing. then ignore the strand.

        # check if there is an inversion
        if detect_inversion(list_segfeat_nb,list_segfeat_azu):
            line="inversion detected\n"
            file_out2.write(line)
        # remove the strands from the lists of segments :
        list_segfeat_nb=remove_strand(list_segfeat_nb)
        list_segfeat_azu=remove_strand(list_segfeat_azu)

        # detect and print variations ignoring the strands
        while (i<len(list_segfeat_nb)) & (j<len(list_segfeat_azu)):
        #for i in range(0,len(list_segfeat_nb)):
            if list_segfeat_nb[i] != list_segfeat_azu[j]: # if there is a difference between the two paths
                if list_segfeat_azu[j] not in list_segfeat_nb: # if the segment in azu is absent in nb
                    if list_segfeat_nb[i] not in list_segfeat_azu: # if the segment in nb is absent is azu
                        # is both segments are absent in the other genome, its a substitution

                        # print if we had an insertion or deletion running
                        if last=='insertion':
                            pos=int(Segments[last_start].start)-int(feat_start)+1
                            line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
                            file_out2.write(line)
                            var+=1
                        elif last=='deletion':
                            if start==1:
                                pos=int(Segments[last_start].start)-int(feat_start)+1
                                line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
                                start=0
                                pos=int(Segments[last_start].start)-int(feat_start)+1
                                line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
                            file_out2.write(line)
                            var+=1
                        last=''
                        last_taille=0
                        last_seq=''

                        # print the substitution
                        # substitution of segment list_segfeat_nb[i][1:] by segment list_segfeat_azu[j][1:]
                        if i==0:
                            pos=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
                            line='substitution at the start of the feature of '+list_segfeat_nb[i]+' '+seg_seq[list_segfeat_nb[i]]+" by "+list_segfeat_azu[j]+' '+seg_seq[list_segfeat_azu[j]]+" at the position "+str(pos)+'\n'
                        else:
                            pos=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
                            line='substitution of '+list_segfeat_nb[i]+' '+seg_seq[list_segfeat_nb[i]]+" by "+list_segfeat_azu[j]+' '+seg_seq[list_segfeat_azu[j]]+" at the position "+str(pos)+'\n'
                        file_out2.write(line)
                        var+=1
                        # go to the next segments on the paths
                        i+=1
                        j+=1
                    else: # azu segment not in nb, but nb segment in azu : insertion
                            if start==1:
                                pos=int(Segments[last_start].start)-int(feat_start)+1
                                line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
                                start=0
                            else:
                                pos=int(Segments[last_start].start)-int(feat_start)+1
                                line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
                            file_out2.write(line)
                            var+=1
                            last_start=''
                            last_seq=''
                        if last=='insertion':
                            last_taille+=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
                            last_seq=last_seq+seg_seq[list_segfeat_azu[j]]
                            last='insertion'
                            last_seq=seg_seq[list_segfeat_azu[j]]
                            last_taille=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
                            last_start=list_segfeat_nb[i]
                        j+=1
                        
                elif list_segfeat_nb[i] not in list_segfeat_azu: # nb segment not in azu, but azu segment in nb : deletion
                        pos=int(Segments[last_start].start)-int(feat_start)+1
                        line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
                        file_out2.write(line)
                        var+=1
                        last=''
                        last_start=''
                        last_taille=0
                        last_seq=''
                    if last=='deletion':
                        last_taille+=Segments[list_segfeat_nb[i]].size
                        last_seq=last_seq+seg_seq[list_segfeat_nb[i]]
                        last='deletion'
                        last_start=list_segfeat_nb[i]
                        last_taille=Segments[list_segfeat_nb[i]].size
                        last_seq=seg_seq[list_segfeat_nb[i]]
                        if i==0:
                            start=1
                    i+=1
                    
                else : # idk yet.
                    line="weird order change"
                    file_out2.write(line)
                    var+=1
                    i+=1
                    j+=1

            else: # segment present in both. print the running indel if there is one
                #print("segment",list_segfeat_nb[i][1:],'ok')
                if last=='insertion':
                    pos=int(Segments[last_start].start)-int(feat_start)+1
                    line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
                    file_out2.write(line)
                    var+=1
                elif last=='deletion':
                    if start==1:
                        pos=int(Segments[last_start].start)-int(feat_start)+1
                        line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
                        start=0
                        pos=int(Segments[last_start].start)-int(feat_start)+1
                        line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
                    file_out2.write(line)
                    var+=1
                last=''
                last_taille=0
                last_start=''
                last_seq=''
                i+=1
                j+=1
        # finish printing the current indel
        if last=='insertion':
            pos=int(Segments[last_start].start)-int(feat_start)+1
            line='insertion of '+last_seq+" at the position "+str(pos)+'\n'
NMarthe's avatar
NMarthe committed
            file_out2.write(line)
            var+=1
        elif last=='deletion':
            if start==1:
                pos=int(Segments[last_start].start)-int(feat_start)+1
                line='deletion at the start of the feature of '+last_seq+" at the position "+str(pos)+'\n'
                start=0
            else:
                pos=int(Segments[last_start].start)-int(feat_start)+1
                line='deletion of '+last_seq+" at the position "+str(pos)+'\n'
NMarthe's avatar
NMarthe committed
            file_out2.write(line)
        # see if the end is missing for one of the two genomes
        # si i a la taille max, peut etre que j aussi est arrivé à la fin, si j+1 est trop grand, donc si j==len(list_segfeat_azu)-1
        # si i et j taille max (ou i et j+1), c bon les deux sont arrivés à la fin (je crois).
        # si i taille max et pas j, insertion. si j taille max et pas i, délétion. 
        if not((i>=len(list_segfeat_nb)-1) & (j>=len(list_segfeat_azu)-1)):
            if (i<len(list_segfeat_nb)-1):
                if not (j>=len(list_segfeat_azu)-1):
                    print('pb')
                # length of the deletion = position of the end of the feature - position of the start of the deletion
                length=feature.stop-Segments[list_segfeat_nb[i]].start
                line="deletion at the end of length "+str(length)+"\n"
                file_out2.write(line)
                var+=1
            elif (j<len(list_segfeat_azu)-1):
                file_out2.write("insertion at the end\n")
                var+=1
        # au début et à la fin, si on a une délétion, on peut que donner la taille du segment, pas la taille de la délétion. 
        # dans le segment ou la feature, garder l'info de où commence le gène sur le segment..... 
        # taille de la délétion = (début segment fin + position de fin) - début last_seg
        # ici last seg c list_nb[i], et segment fin c'est list_nb[-1]
        if var==0:
            line=feat+" : no variation\n"
            file_out2.write(line)
    else:
        line=feat+" : feature entirely absent.\n"
        file_out2.write(line)
    file_out2.write('\n')

    return diff_list


def remove_strand(list_seg):
    list_unstrand=list()
    for seg_stranded in list_seg:
        list_unstrand.append(seg_stranded[1:])
    return list_unstrand

def detect_inversion(list_1,list_2):
    list_1_unstrand=remove_strand(list_1)
    list_2_unstrand=remove_strand(list_2)

    # get the list of segments in common
    seg_common=list()
    for seg in list_1_unstrand:
        if seg in list_2_unstrand:
            seg_common.append(seg)
    
    # for each segment in common, check if the strand is the same. check index in list unstranded to get the segment in list stranded
    same_strand_count=0
    for seg in seg_common:
        index_1=list_1_unstrand.index(seg)
        index_2=list_2_unstrand.index(seg)
        if list_1[index_1]==list_2[index_2]:
            same_strand_count+=1

    # if there is less than 10% of strand difference among the common segments, return False, no inversion 
    if len(seg_common)-same_strand_count<len(seg_common)/10:
        return False
    else:
        return True