Skip to content
Snippets Groups Projects
Functions.py 47.3 KiB
Newer Older
  • Learn to ignore specific revisions
  • from Graph_gff import Segments, Features, get_feature_start_on_segment, get_feature_stop_on_segment,invert_seg,search_segment
    
    global segments_on_target_genome
    segments_on_target_genome={}
    
    def get_seg_occ(seg,walk,feat,copy):
        seg_in_walk=segments_on_target_genome[seg][walk]
        for seg_occurence in seg_in_walk:
    
            for feat_copy in seg_occurence[5:]:
                if (feat_copy[0]==feat) & (feat_copy[1]==copy):
    
                    return seg_occurence # [chr,start,stop,strand,index,copies]
    
        print("no occ found ???",seg,walk,feat,copy)
        print(segments_on_target_genome[seg][walk])
        print(Features[feat].segments_list_source)
        print(Features[feat].segments_list_target)
        exit()
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # get the start position of the features on the linear target genome, using their coordinates on the graph and the coordinantes of the segments on the genome
    
    def get_feature_start_on_target_genome(start_seg,feat_id,walk,copy_id):
        seg_start_pos=get_seg_occ(start_seg,walk,feat_id,copy_id)[1]
    
        feat_start_pos=get_feature_start_on_segment(start_seg,feat_id)    
    
        return seg_start_pos+feat_start_pos-1
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # get the stop position of the features on the linear target genome, using their coordinates on the graph and the coordinantes of the segments on the genome
    
    def get_feature_stop_on_target_genome(stop_seg,feat_id,walk,copy_id):
        seg_start_pos=get_seg_occ(stop_seg,walk,feat_id,copy_id)[1]
    
        feat_stop_pos=get_feature_stop_on_segment(stop_seg,feat_id)
    
        return seg_start_pos+feat_stop_pos-1
    
    NMarthe's avatar
    NMarthe committed
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # get the start position of the features on the linear target genome for inverted features
    
    def get_feature_start_on_target_genome_inv(start_seg,feat_id,walk,copy_id):
        seg_end_pos=get_seg_occ(start_seg,walk,feat_id,copy_id)[2]
    
        feat_start_pos=get_feature_start_on_segment(start_seg,feat_id)
        return seg_end_pos-feat_start_pos+1
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # get the stop position of the features on the linear target genome for inverted features
    
    def get_feature_stop_on_target_genome_inv(stop_seg,feat_id,walk,copy_id):
        seg_end_pos=get_seg_occ(stop_seg,walk,feat_id,copy_id)[2]
    
        feat_stop_pos=get_feature_stop_on_segment(stop_seg,feat_id)
        return seg_end_pos-feat_stop_pos+1
    
    
    
    
    # functions to get the gff with one line per feature
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    
    # check if the length of the feature on the target genome passes the filter max_diff
    
    def right_size(size,max_diff,feat):
        if max_diff==0:
    
        return not ((size>Features[feat].size*max_diff) or (size<Features[feat].size/max_diff)) 
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # generates the line for the gff of the target genome
    
    def create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,walk,cov,id,copy_id):
        [chr,strand,feature]=[get_seg_occ(first_seg,walk,feature_id,copy_id)[0],Features[feature_id].strand,Features[feature_id]]
    
        annotation=f'{feature.annot};Size_diff={size_diff};coverage={cov};sequence_ID={id}' # Nb_variants={var_count};
    
            start=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk,copy_id)
            stop=get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk,copy_id)
    
            start=get_feature_start_on_target_genome(first_seg,feature_id,walk,copy_id)
            stop=get_feature_stop_on_target_genome(last_seg,feature_id,walk,copy_id)
    
        output_line=f'{chr}\tGrAnnoT\t{feature.type}\t{start}\t{stop}\t.\t{strand}\t.\t{annotation}\n'
    
        return output_line
    
    # functions to get the alignment for the transfered genes
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # creates an alignment for two segments
    
    def segment_aln(type,seg_seq,seg_a,seg_b,first,feature_id,last):
    
                    seq_aln=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]
    
                    seq_aln=get_segment_sequence(seg_seq,seg_a)[:feature.pos_stop]
    
                    seq_aln=get_segment_sequence(seg_seq,seg_a)
    
                line_a=seq_aln
                line_b=seq_aln
                len_aln=len(seq_aln)
                line_c=len_aln*"*"
            case "substitution":
    
                seq_aln_a=get_segment_sequence(seg_seq,seg_a)
                seq_aln_b=get_segment_sequence(seg_seq,seg_b)
    
                len_a=len(seq_aln_a)
                len_b=len(seq_aln_b)
                if len_a>len_b:
                    diff_len=len_a-len_b
                    line_a=seq_aln_a
                    line_b=seq_aln_b+diff_len*"-"
                    line_c=len_a*" "
                else:
                    diff_len=len_b-len_a
                    line_a=seq_aln_a+diff_len*"-"
                    line_b=seq_aln_b
                    line_c=len_b*" "
            case "insertion":
    
                seq_aln_b=get_segment_sequence(seg_seq,seg_b)
    
                len_b=len(seq_aln_b)
                line_a=len_b*"-"
                line_b=seq_aln_b
                line_c=len_b*" "
            case "deletion":
    
                    seq_aln_a=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]
    
                    seq_aln_a=get_segment_sequence(seg_seq,seg_a)
    
                len_a=len(seq_aln_a)
                line_a=seq_aln_a
                line_b=len_a*"-"
                line_c=len_a*" "
            case "end_deletion":
                seq_aln_a=""
    
                    seq_aln_a+=get_segment_sequence(seg_seq,segment)
    
                seq_aln_a+=get_segment_sequence(seg_seq,seg_a[-1])[0:feature.pos_stop] # for the last segment, only take the part that the feature is on
    
                len_a=len(seq_aln_a)
                line_a=seq_aln_a
                line_b=len_a*"-"
                line_c=len_a*" "
    
    
        return [line_a,line_b,line_c,False] # check the orientation of the segment later
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # formats the alignment lines
    
    def parse_aln_lines(line_a,line_b,line_c,feature_id):
    
        if (len(line_a)!=len(line_b)) or (len(line_b)!=len(line_c)):
    
            print("line lengths differ in alignment")
    
        nb_res_a=0
        nb_res_b=0
    
        while len_parsed<len_to_parse:
            len_header=len(feature_id)+11
            headers=[feature_id+"_source    ",feature_id+"_target    ",len_header*" "]
    
            add_a=line_a[len_parsed:len_parsed+60]
            add_b=line_b[len_parsed:len_parsed+60]
            add_c=line_c[len_parsed:len_parsed+60]
            nb_res_a+=len(add_a)-add_a.count("-")
            nb_res_b+=len(add_b)-add_b.count("-")
    
            aln_line+=f'{headers[0]}{add_a}    {nb_res_a}\n'
            aln_line+=f'{headers[1]}{add_b}    {nb_res_b}\n'
            aln_line+=f'{headers[2]}{add_c}\n\n'
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # creates the alignment for a feature
    
    def create_line_aln(feature_path_source_genome,feature_path_target_genome,seg_seq,feature_id):
    
        first=True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on
        last=False # same for the last part of the feature
    
        while (i<len(feature_path_source_genome)) and (j<len(feature_path_target_genome)):
    
            if i==len(feature_path_source_genome)-1:
                last=True
    
            if feature_path_source_genome[i] != feature_path_target_genome[j]: # if there is a difference between the two paths
                if feature_path_target_genome[j] not in feature_path_source_genome: # if the segment in target genome is absent in source genome
    
                    if feature_path_source_genome[i] not in feature_path_target_genome: # if the segment in source genome is absent is target genome : substitution
                        [add_a,add_b,add_c,first]=segment_aln("substitution",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
    
                    else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion
                        [add_a,add_b,add_c,first]=segment_aln("insertion",seg_seq,"",feature_path_target_genome[j],first,feature_id,last)
    
                elif feature_path_source_genome[i] not in feature_path_target_genome: # source_genome segment not in target genome, but target genome segment in source_genome : deletion
                    [add_a,add_b,add_c,first]=segment_aln("deletion",seg_seq,feature_path_source_genome[i],"",first,feature_id,last)
    
                    line_a+=add_a;line_b+=add_b;line_c+=add_c
                    i+=1
                else : # if both segments are present in the other genome but not at the same position. weird case never found yet
    
                    [add_a,add_b,add_c,first]=segment_aln("substitution",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
    
            else: # segment present in both, no variation. 
                [add_a,add_b,add_c,first]=segment_aln("identity",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
    
        if i<=len(feature_path_source_genome)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
    
            [add_a,add_b,add_c,first]=segment_aln("end_deletion",seg_seq,feature_path_source_genome[i:],"",first,feature_id,last)
    
        return parse_aln_lines(line_a,line_b,line_c,feature_id)
    
    NMarthe's avatar
    NMarthe committed
    # functions to output the stats on the transfer
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # stats about missing segments and feature type, not used, will change.
    
    def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feature_id,walk):
    
    NMarthe's avatar
    NMarthe committed
    # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
        feature_missing_segments[5].append(feature_id)
    
    
        if first_seg=='' : # no segment of the feature is in the genome, the feature is missing entirely
    
    NMarthe's avatar
    NMarthe committed
            feature_missing_segments[3].append(feature_id)
    
        elif first_seg != list_seg[0]: # the first segment is missing 
    
    NMarthe's avatar
    NMarthe committed
            feature_missing_segments[0].append(feature_id)
    
        elif last_seg!=list_seg[-1]: # the last segment is missing
    
    NMarthe's avatar
    NMarthe committed
            feature_missing_segments[2].append(feature_id)
    
        # go through all the segments, check if some are missing in the middle of the feature
    
        elif (len(list_seg)!=1) and (feature_id not in feature_missing_segments[3]): # to access the second to last element
    
    NMarthe's avatar
    NMarthe committed
            for segment in list_seg[1-(len(list_seg)-2)]:
    
                if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
    
    NMarthe's avatar
    NMarthe committed
                    feature_missing_segments[1].append(feature_id)
                    break
    
        # go through the segments, to see if one is missing anywhere on the feature
        for segment in list_seg:
    
            if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
    
    NMarthe's avatar
    NMarthe committed
                if feature_id not in feature_missing_segments[4]:
                    feature_missing_segments[4].append(feature_id)
                    break
    
        # if the feature doesnt have a missing segment, it is complete.         ADD THE PATH CHECK FOR INSERTIONS !!
        if feature_id not in feature_missing_segments[4]:
            feature_missing_segments[6].append(feature_id)
    
    def get_annot_features(list_features):
        list_annot_features=[]
        for feature in list_features:
            list_annot_features.append(Features[feature].note)
        return list_annot_features
    
    def count_hypput_total(list_annot_first):
        total=len(list_annot_first)
        count_hypput=0
        for annot in list_annot_first:
    
            if ("hypothetical" in annot) or ("putative" in annot):
    
    NMarthe's avatar
    NMarthe committed
                count_hypput+=1
        return [count_hypput,total]
    
    # print stats on the transfer : number of feature that have segments in different positions missing. 
    def stats_features(feature_missing_segments):
    # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
        list_annot_first=get_annot_features(feature_missing_segments[0])
        [hyp_put,total]=count_hypput_total(list_annot_first)
        print("\nthe first segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
    
        list_annot_middle=get_annot_features(feature_missing_segments[1])
        [hyp_put,total]=count_hypput_total(list_annot_middle)
        print("a middle segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
    
        list_annot_last=get_annot_features(feature_missing_segments[2])
        [hyp_put,total]=count_hypput_total(list_annot_last)
        print("the last segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
    
        list_annot_all=get_annot_features(feature_missing_segments[3])
        [hyp_put,total]=count_hypput_total(list_annot_all)
        print(total,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
    
        list_annot_total=get_annot_features(feature_missing_segments[4])
        [hyp_put,total]=count_hypput_total(list_annot_total)
        print("there is at least one segment missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
    
        list_annot_ok=get_annot_features(feature_missing_segments[6])
        [hyp_put,total]=count_hypput_total(list_annot_ok)
        print(total ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
    
        list_annot_features=get_annot_features(feature_missing_segments[5])
        [hyp_put,total]=count_hypput_total(list_annot_features)
        print("there is", total,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
    
    
    
    
    # functions to generate the different gffs
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segmnet list)
    
    def get_segments_positions_on_genome(pos_seg): # add to the dict the info about the segments.
    
    NMarthe's avatar
    NMarthe committed
        bed=open(pos_seg,'r')
        lines=bed.readlines() # read line by line ?
        bed.close()
    
        file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename
    
    NMarthe's avatar
    NMarthe committed
        for line in lines:
            line=line.split()
    
            [seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
    
            # check if segment present twice on the same walk ???
            #segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
            if seg not in segments_on_target_genome:
                segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
    
            if file_name not in segments_on_target_genome[seg]:
                segments_on_target_genome[seg][file_name]=list()
            segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index])
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # look for the segment on either strand of the target genome
    
    def search_seg_on_target_genome(segment):
        inverted_segment=invert_seg(segment)
        if segment in segments_on_target_genome:
            #if inverted_segment in segments_on_target_genome:
            #    print(segment," found in both orientations")
            return segment
        elif inverted_segment in segments_on_target_genome:
            #print("inverted seg found *****")
            return inverted_segment
        else:
            return False
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # look for a segment on a walk, in either orientations
    def search_seg_on_walk(segment,walk): # for now just print the first found, look for several later...
    
        inverted_segment=invert_seg(segment)
        if segment in segments_on_target_genome:
            if walk in segments_on_target_genome[segment]:
                return segment
        elif inverted_segment in segments_on_target_genome:
            if walk in segments_on_target_genome[inverted_segment]:
                return inverted_segment
        else:
            return False
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # generates a dictionnary that associaces the segments to their sequence : s5->AGGCTAA
    
    def get_segments_sequence(segments_file,segments_list):
        file_segments=open(segments_file,'r')
        lines_segments=file_segments.readlines()
        file_segments.close()
    
    NMarthe's avatar
    NMarthe committed
        seg_seq={}
    
    NMarthe's avatar
    NMarthe committed
            line=line.split()
    
            seg_id='s'+line[1]
            if seg_id in segments_list:
                seg_seq[seg_id]=line[2]
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # generates a dictionnary that associates a walk_name to a list of segments : chr10->[>s1,>s2,>s4]
    
    def get_paths(walks_file,target_genome):
        file_walks=open(walks_file,'r')
        lines_walks=file_walks.readlines()
        file_walks.close()
    
            seq_name=line[1]+"_"+line[3]
            if target_genome in seq_name: # get the walk of the genome
    
                path=line[6].split(',')[1:]
    
                list_segments=[]
                for segment in path:
                    if segment[0:1]=='>':
                        list_segments.append('>s'+segment[1:])
                    elif segment[0:1]=='<':
                        list_segments.append('<s'+segment[1:])
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # get the first and last segment of the list that is in the target genome (possibly several pairs)
    def get_first_last_seg(list_seg):
    
    
        list_walks=get_walks_feature_cross(list_seg) # get all the walks where there is a segment of the feature
        for walk in list_walks: # find the first and last seg for each walk
            for segment in list_seg: # look for first_seg
                seg_found=search_seg_on_walk(segment,walk)
                if seg_found:
                    first_seg_found=seg_found
                    break
            if first_seg_found!='': # look for last_seg
                for segment in reversed(list_seg):
                    last_seg_found=search_seg_on_walk(segment,walk)
                    if last_seg_found:
    
            list_first_last_segs.append([first_seg_found,last_seg_found,walk_found])
            [first_seg_found,last_seg_found,walk_found]=['','','']
        # return all the match
        return list_first_last_segs
    
    # functions to get the detail of the variations in the features
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # find all the walks that contain a segment of the feature (list_seg is the walk of the feature on the source genome)
    
    def get_walks_feature_cross(list_seg):
        list_walks=list()
        for segment in list_seg:
            seg_found=search_seg_on_target_genome(segment)
            if seg_found: # if the segment or the reverse complement is on the target genome
                for walk in segments_on_target_genome[seg_found]:
                    if walk not in list_walks:
                        list_walks.append(walk)
        return list_walks
    
    
    # returns a list of feature's child, and their childs' childs, etc. 
    def get_child_list(feature_id):
        feature=Features[feature_id]
        list_childs=[]
        for child in feature.childs:
            list_childs.append(child) # add the child to the list
            list_childs+=get_child_list(child) # add the child's childs to the list
        return list_childs
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # add the paths of the feature on the target genome in the object Feature
    
    def add_target_genome_paths(feature_id,target_genome_paths):
    
        feature=Features[feature_id]
        list_seg=feature.segments_list_source
    
        list_first_last_segs=get_first_last_seg(list_seg)
    
    
        for match in list_first_last_segs: # for each walk that has the feature
    
            # get the first and last segments of all the copies on this walk
            first_last_segs_list=find_gene_copies(list_seg,walk_name,feature_id)
    
            copy_number=0
            for first_seg,last_seg in first_last_segs_list: # get the feature path for all the copies
                copy_number+=1
                copy_id="copy_"+str(copy_number) # get the copy that corresponds to this pair of first_seg,last_seg
    
                feature_target_path=get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name,copy_id,feature_id)
    
                feature_path=[walk_name,copy_id,feature_target_path] # informations about the occurence of the feature
    
                feature.segments_list_target.append(feature_path)
    
                # get the pos start of the first and last segments (existing because done in find_gene_copies)
    
                # then look for segs that are between.
                walk_start=get_seg_occ(feature_target_path[0],walk_name,feature_id,copy_id)[1]
                walk_stop=get_seg_occ(feature_target_path[-1],walk_name,feature_id,copy_id)[2]
                feat_copy=(feature_id,copy_id)
    
                for seg in feature_target_path[1:-1]: # the first and last segs are already done in find_gene_copies
    
                    for segment in segments_on_target_genome[seg][walk_name]: # find the right occurence
                        if walk_start <= segment[1] <= walk_stop: 
    
                            segment.append(feat_copy) # annotate the segment : feature_id, copy_id
    
                # add the paths of the gene's child features
                add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths)
         
        if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ?
            feature.segments_list_target.append(['','',[]])
    
    def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths):
        childs_list=get_child_list(feature_id)
        for child_id in childs_list:
            child=Features[child_id]
    
            # find child path in parent's path
            [child_first_seg,child_last_seg]=['','']
            for seg in child.segments_list_source:
                if seg in feature_target_path: # parent path
                    child_first_seg=seg
                    break
                elif invert_seg(seg) in feature_target_path:
                    child_first_seg=invert_seg(seg)
                    break
    
            if child_first_seg!='': # the child feature may be absent in this occurence of the parent
                for seg in reversed(child.segments_list_source):
                    if seg in feature_target_path: # parent path
                        child_last_seg=seg
                        break
                    elif invert_seg(seg) in feature_target_path:
                        child_last_seg=invert_seg(seg)
                        break
    
                # get the path of the child feature
                child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id)
                feat_copy=(child_id,copy_id)
    
                feature_path=[walk_name,copy_id]
                feature_path.append(child_path)
                child.segments_list_target.append(feature_path)
    
                for segment_id in child_path: # annotate the segments with info about the occurence of the child feature
                    segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id)
                    segment.append(feat_copy)
    
    # find all the copies of the segments from the source list in the target genome
    def find_all_seg(list_seg_source,walk_name):
    
        list_seg_target=[] # contains list of info for each seg : [seg_id,seg_strand,start_pos,index_on_source_walk]
        list_seg_source_unstranded=[] # contains the path in the source genome, but with the strand separated from the segment_id : [s24,>]
    
        for seg in list_seg_source:
            list_seg_source_unstranded.append([seg[1:],seg[0]]) # seg_id,seg_strand : [s24,>]
            seg_inverted=invert_seg(seg)
            # look for all the segment copies in the target genome walk, in both orientations
    
            if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]): # if the segment is in the target genome on the right walk (chr,ctg)
                for copy in segments_on_target_genome[seg][walk_name]: # take all its copies
    
                    seg_info=[seg[1:],seg[0],int(copy[1]),index] # [s24,>,584425,4]
                    list_seg_target.append(seg_info)
    
            if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) : # same but with the segment inverted
    
                for copy in segments_on_target_genome[seg_inverted][walk_name]:
                    seg_info=[seg_inverted[1:],seg_inverted[0],int(copy[1]),index]
                    list_seg_target.append(seg_info)
            index+=1
        list_seg_target.sort(key=sort_seg_info) # order the list of segments by start position
    
        return [list_seg_target,list_seg_source_unstranded]
    
    def find_gene_copies(list_seg_source,walk_name,feature_id):
        # find all copies of all segments from the gene in the target genome (in both orientations)
        [list_seg_target,list_seg_source_unstranded]=find_all_seg(list_seg_source,walk_name)
        # find each copy of the gene in the ordered list of segments
        [first_segs_list,last_segs_list]=detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id)
    
        # join the first and last seg lists
        first_last_segs_list=[]
        index=0
        for first_seg in first_segs_list:
            last_seg=last_segs_list[index]
            pair=(first_seg,last_seg)
            first_last_segs_list.append(pair)
            index+=1
    
        return first_last_segs_list # return a list of pairs (first_seg,last_seg)
    
    # called by find_gene_copies
    def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id):
        # prepare the variables for the first iteration of the for loop
        [old_id,old_strand,old_start,old_index]=list_seg_target[0]
        [first_segs_list,last_segs_list]=[[],[]]
        old_seg_id=old_strand+old_id
    
    
        copy_number=1
        copy_id="copy_"+str(copy_number)
        feat_copy=(feature_id,copy_id)
    
        for segment in segments_on_target_genome[old_seg_id][walk_name]: # look for the first seg to add the occurence info
    
            if segment[1]==old_start:
                copy_id="copy_"+str(copy_number)
                feat_copy=(feature_id,copy_id)
                segment.append(feat_copy)
                break
    
        # adjust old_index for the first iteration of the loop
        first_inversion=(old_strand!=list_seg_source_unstranded[old_index][1])
        if first_inversion:
            old_index+=1
        else:
            old_index-=1
    
    
        for seg in list_seg_target: # find and annotate (with feat_copy) all the first and last segments of the feature copies
            [new_id,new_strand,new_start,new_index]=seg
            new_seg_id=new_strand+new_id
    
            inversion=(seg[1]!=list_seg_source_unstranded[new_index][1]) # inversion if this segment's strand is not the same as in the source walk
    
            if inversion : 
    
                if not((old_strand==new_strand) and (old_index>new_index)): # not (if the index decreases and the strand stays the same, it is the same gene copy)
    
                    add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy) # add info for the last seg of the previous copy
    
    
                    copy_id="copy_"+str(copy_number)
                    feat_copy=(feature_id,copy_id) # new feature copy, change the info
    
    
                    add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy) # add info for the first seg of the new copy
    
                if not((old_strand==new_strand) and (old_index<new_index)): # not (if the index increases and the strand stays the same, it is the same gene copy)
    
                    add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy) # add info for the last seg of the previous copy
    
    
                    copy_id="copy_"+str(copy_number)
                    feat_copy=(feature_id,copy_id) # new feature copy, change the info
    
    
                    add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy) # add info for the first seg of the new copy
    
            [old_strand,old_index,old_seg_id,old_start]=[new_strand,new_index,new_seg_id,new_start]
    
        # add the last seg info
    
        add_occurence_info_seg(old_seg_id,walk_name,new_start,feat_copy) # add info for the last seg of the last copy
    
        return [first_segs_list,last_segs_list]
    
    def add_occurence_info_seg(target_seg_id,walk_name,target_start,feat_copy):
        for segment in segments_on_target_genome[target_seg_id][walk_name]: # look for the right occurence of the segment
            if segment[1]==target_start:
                segment.append(feat_copy) # add seg info
                break
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # find the feature's path in target genome walk
    
    def get_feature_path(target_genome_path,first_seg,last_seg,walk_name,copy_id,feature_id):
        # look for first_seg and last_seg that has the right copy_id for this feature
    
        first_seg_index=get_seg_occ(first_seg,walk_name,feature_id,copy_id)[4]
        last_seg_index=get_seg_occ(last_seg,walk_name,feature_id,copy_id)[4]
    
        first_index=min(first_seg_index,last_seg_index)
        last_index=max(first_seg_index,last_seg_index)
    
        feature_path_target_genome=target_genome_path[first_index:last_index+1]
    
        return feature_path_target_genome
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # count the variations between two lists
    
    def count_variations(feature_id,target_list):
    
        feature=Features[feature_id]
        if len(target_list)!=0:
            source_list=feature.segments_list_source
    
            inversion=detect_feature_inversion(source_list,target_list)
    
            target_dict=dict.fromkeys(target_list,"")
    
            source_dict=dict.fromkeys(source_list,"") # convert list into dict to search segments in dict quicker.
            var_count=0
            for segment in source_dict:
                if segment not in target_dict:
                    var_count+=1
            for segment in target_dict:
                if segment not in source_dict:
                    var_count+=1
            # this counts the substitutions twice, as insertion+deletion.
        return var_count
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # get the coverage and sequence id of a feature
    
    def get_id_cov(feature_id,seg_size,target_list): # seg_size has unoriented segments : s25
    
        feature=Features[feature_id]
        source_list=feature.segments_list_source
    
    
        inversion=detect_feature_inversion(source_list,target_list)
        if inversion:
            target_list=invert_segment_list(target_list)
    
    
        [match,subs,inser,delet]=[0,0,0,0]
    
        [i,j]=[0,0]
        first=True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on
        last=False # same for the last part of the feature # for id and ins.
    
        while (i<len(source_list)) and (j<len(target_list)):
            if i==len(source_list)-1:
                last=True
            if source_list[i] != target_list[j]: # if there is a difference between the two paths
                if target_list[j] not in source_list: # if the segment in target genome is absent in source genome
                    if source_list[i] not in target_list: # if the segment in source genome is absent is target genome : substitution
                        add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last)
                        match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
                        i+=1;j+=1
                    else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion
                        add=segment_id_cov("insertion",seg_size,source_list[i],target_list[j],first,feature,last)
                        match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
                        j+=1
                elif source_list[i] not in target_list: # source_genome segment not in target genome, but target genome segment in source_genome : deletion
                    add=segment_id_cov("deletion",seg_size,source_list[i],target_list[j],first,feature,last)
                    match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
                    i+=1
                else : # if both segments are present in the other genome but not at the same position. weird case never found yet
                    add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last)
                    match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
                    i+=1;j+=1
            else: # segment present in both, no variation. 
                add=segment_id_cov("identity",seg_size,source_list[i],target_list[j],first,feature,last)
                match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
                i+=1;j+=1
    
        if i<=len(source_list)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
            add=segment_id_cov("end_deletion",seg_size,source_list[i:],'',first,feature,last)
            match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
    
        cov=round((match+subs)/(match+subs+delet),3)
        id=round(match/(match+subs+inser+delet),3)
    
        #var_count=count_variations(feature_id,target_list)
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # computes the cov/id calculation for a segment pair
    
    def segment_id_cov(type,seg_size,seg_a,seg_b,first,feature,last):
        [match,subs,inser,delet]=[0,0,0,0]
        match type:
            case "identity":
                if first:
                    match+=seg_size[seg_a[1:]]-feature.pos_start+1
                elif last:
                    match+=feature.pos_stop
                else:
                    match+=seg_size[seg_a[1:]]
            case "substitution":
                if seg_size[seg_b[1:]]!=seg_size[seg_a[1:]]: # substitution can be between segments of different size
                    if seg_size[seg_b[1:]]>seg_size[seg_a[1:]]:
                        subs+=seg_size[seg_a[1:]]
                        inser+=seg_size[seg_b[1:]]-seg_size[seg_a[1:]]
                    elif seg_size[seg_b[1:]]<seg_size[seg_a[1:]]:
                        subs+=seg_size[seg_b[1:]]
                        delet+=seg_size[seg_a[1:]]-seg_size[seg_b[1:]]
                else:
                    subs+=seg_size[seg_a[1:]]
    
            case "insertion":
                inser+=seg_size[seg_b[1:]]
            case "deletion":
                if first:
                    delet+=seg_size[seg_a[1:]]-feature.pos_start+1
                else:
                    delet+=seg_size[seg_a[1:]]
            case "end_deletion":
                for seg in seg_a[:-1]:
                    delet+=seg_size[seg[1:]]
                delet+=feature.pos_stop
    
        return [match,subs,inser,delet,False] # check the orientation of the segment later
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # invert the given strand
    
        match strand:
            case "+":
    
            case "-":
                return "+"
    
            case default:
                return ""
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # outputs the nucleotide sequence of a list of segments, corresponding to the end of a feature
    
    def get_sequence_list_seg(list_seg,i,feature,seg_seq):
    
        for k in range(i,len(list_seg)):
            if k==len(list_seg)-1:
                del_sequence+=get_segment_sequence(seg_seq,list_seg[k])[0:feature.pos_stop]
    
                del_sequence+=get_segment_sequence(seg_seq,list_seg[k])
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # outputs the sequence of an oriented segment
    
    def get_segment_sequence(seg_seq,segment):
    
            return seg_seq[segment[1:]]
        else:
    
            return reverse_complement(seg_seq[segment[1:]])
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # outputs the reverse complement of a sequence
    
    def reverse_complement(sequence):
        sequence_rc=""
        for char in sequence:
            sequence_rc+=complement(char)
        return sequence_rc[::-1]
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # outputs the reverse complement of a nucleotide
    
    def complement(nucl):
        match nucl:
            case "A":
                return "T"
            case "C":
                return "G"
            case "G":
                return "C"
            case "T":
                return "A"
        return nucl
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # stores information about a feature and its current variation
    
        def __init__(self,feature_id,feature_type,chr,start_new,stop_new,inversion,size_diff,size_new):
    
            self.feature_id=feature_id
            self.feature_type=feature_type
            self.chr=chr
            self.start_new=start_new
            self.stop_new=stop_new
    
            self.inversion=inversion
            self.size_diff=size_diff
            self.size_new=size_new
    
            self.type=''
            self.last_seg_in_target=''
    
            self.seg_ref=list()
            self.seg_alt=list()
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # initiate a Variation object with the information on the feature it is on
    
    def create_var(feature_id,first_seg,last_seg,walk,copy_id):
    
        feature=Features[feature_id]
        # get feature paths on the original genome and on the target genome
    
        feature_path_target_genome=feature.segments_list_target[0][1]
    
        feature_path_source_genome=feature.segments_list_source
    
        inversion=detect_feature_inversion(feature_path_source_genome,feature_path_target_genome)
    
    
            feature_path_target_genome=invert_segment_list(feature_path_target_genome)
    
            start_new_genome=get_feature_start_on_target_genome_inv(last_seg,feature_id,walk,copy_id)
            stop_new_genome=get_feature_stop_on_target_genome_inv(first_seg,feature_id,walk,copy_id)
    
            size_new_genome=start_new_genome-stop_new_genome+1
    
            start_new_genome=get_feature_start_on_target_genome(first_seg,feature_id,walk,copy_id)
            stop_new_genome=get_feature_stop_on_target_genome(last_seg,feature_id,walk,copy_id)
    
            size_new_genome=stop_new_genome-start_new_genome+1
    
        size_diff=str(size_new_genome-feature.size)
    
        sequence_name=get_seg_occ(first_seg,walk,feature_id,copy_id)[0]
    
        variation=Variation(feature_id,feature.type,sequence_name,start_new_genome,stop_new_genome,inversion,size_diff,size_new_genome)
    
        return(variation,feature_path_source_genome,feature_path_target_genome)
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # reset the informations of the variation, but keep the information about the feature
    
        variation.type='' # make type enumerate
    
        variation.size_var=0
        variation.start_var=''
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # find the position of a substitution on the source and the target sequence
    
    def get_old_new_pos_substitution(feat_start,variation,start_feat_seg_target,feat,walk,copy_id):
    
        seg_pos=search_segment(variation.start_var)
        pos_old=str(int(Segments[seg_pos].start)-int(feat_start))
    
        var_start_seg=variation.start_on_target
    
        if variation.inversion:
    
            start_feat_seg_target=invert_seg(start_feat_seg_target)
    
            var_start_seg=invert_seg(var_start_seg)
    
            end_var=get_seg_occ(var_start_seg,walk,feat,copy_id)[2]
            start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id)
    
            start_var=get_seg_occ(var_start_seg,walk,feat,copy_id)[1]
            start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id)
    
        return [pos_old,pos_new] # pos_old and pos_new are the base before the change
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # find the position of an insertion on the source and the target sequence
    
    def get_old_new_pos_insertion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id):
    
        seg_pos=search_segment(variation.start_var) # start_var is the segment AFTER the insertion
    
        pos_old=str(int(Segments[seg_pos].start)-int(feat_start))
    
        start_var_seg=variation.start_var
    
        if variation.inversion:
    
            start_feat_seg_target=invert_seg(start_feat_seg_target)
    
            start_var_seg=invert_seg(start_var_seg)
    
            end_var=get_seg_occ(start_var_seg,walk,feat,copy_id)[2]+len(variation.alt) # start_var_seg is the segment AFTER the insertion
            start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id)
    
            start_var=get_seg_occ(start_var_seg,walk,feat,copy_id)[1]-len(variation.alt) # start_var_seg is the segment AFTER the insertion
            start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id)
    
        return [pos_old,pos_new] # pos_old and pos_new are the base before the change
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # find the position of a deletion on the source and the target sequence
    
    def get_old_new_pos_deletion(variation,feat_start,start_feat_seg_target,feat,walk,copy_id):
    
        seg_pos=search_segment(variation.start_var)
    
            pos_old=int(Segments[seg_pos].start)-int(feat_start)+Features[feat].pos_start-1
    
            pos_old=int(Segments[seg_pos].start)-int(feat_start)
    
                print("error with variation position",variation.inversion,"***")
    
    
        if variation.last_seg_in_target=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet. 
    
            start_var_seg=variation.last_seg_in_target
    
            if variation.inversion:
    
                start_feat_seg_target=invert_seg(start_feat_seg_target)
    
                start_var_seg=invert_seg(start_var_seg)
    
                start_var=get_seg_occ(start_var_seg,walk,feat,copy_id)[1]-1
                start_feat=get_feature_start_on_target_genome_inv(start_feat_seg_target,feat,walk,copy_id)
    
                start_var=get_seg_occ(start_var_seg,walk,feat,copy_id)[2]+1
                start_feat=get_feature_start_on_target_genome(start_feat_seg_target,feat,walk,copy_id)
    
        return [pos_old,pos_new] # pos_old and pos_new are the base before the change
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # change the variation information, but keep the feature information (the variation is on the feature)
    
    def init_new_var(variation,type,feature_path_source_genome,feature_path_target_genome,i,j,seg_seq,feature):
    
        variation.start_var=feature_path_source_genome[i]
    
            variation.start_on_target=feature_path_target_genome[j]
    
            variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
            variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
            variation.seg_ref.append(feature_path_source_genome[i])
            variation.seg_alt.append(feature_path_target_genome[j])
    
        elif type=="insertion":
            variation.ref="-"
    
            variation.alt=get_segment_sequence(seg_seq,feature_path_target_genome[j])
            variation.seg_alt.append(feature_path_target_genome[j])
    
        elif type=="deletion":
            if i==0: # if the deletion is at the start of the feature, the deletion doesnt always start at the start at the first segment : 
                #use pos_start, position of the feature on its first segment
    
                variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])[feature.pos_start-1:]
                variation.seg_ref.append(feature_path_source_genome[i])
    
            else: # else, the deletion will always start at the start of the first segment.
    
                variation.ref=get_segment_sequence(seg_seq,feature_path_source_genome[i])
                variation.seg_ref.append(feature_path_source_genome[i])
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # update the variation
    
    def continue_var(variation,seg_seq,feature_path_source_genome,feature_path_target_genome,i,j,genome_to_continue):
    
        if variation.type=="substitution":
    
            if genome_to_continue==0: # genome_to_continue allows to choose if the substitution continues for the original or the target genome, or both.
    
                variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
                variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
                variation.seg_ref.append(feature_path_source_genome[i])
                variation.seg_alt.append(feature_path_target_genome[j])
    
            elif genome_to_continue==1: # deletion
    
                variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
                variation.seg_ref.append(feature_path_source_genome[i])
    
            elif genome_to_continue==2: # insertion
    
                variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
                variation.seg_alt.append(feature_path_target_genome[j])
    
        elif variation.type=="insertion":
    
            variation.alt+=get_segment_sequence(seg_seq,feature_path_target_genome[j])
            variation.seg_alt.append(feature_path_target_genome[j])
    
        elif variation.type=="deletion":
    
            variation.ref+=get_segment_sequence(seg_seq,feature_path_source_genome[i])
            variation.seg_ref.append(feature_path_source_genome[i])
    
    
    # functions to detect inversions
    
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # gives the list of segments from dict1 that are in dict2
    
    def get_common_segments(dict1,dict2):
        list_common=[]
        for segment in dict1:
            if segment in dict2:
                list_common.append(segment)
        return list_common
    
    # check if common segments in the two dict have the same strand
    def compare_strand(dict1,dict2): # dict1 and dict2 : [seg_id]->[seg_strand]
        seg_common=get_common_segments(dict1,dict2)
    
        # for each segment in common, check if the strand is the same
    
        same_strand_count=0
        for segment in seg_common:
    
                same_strand_count+=1
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # check if the two dict have their segments in the inverted order
    
    def detect_segment_order_inversion(dict1,dict2):
        list_1_common=get_common_segments(dict1,dict2)
        list_2_common=get_common_segments(dict2,dict1) # same segments, different orders
    
        list_2_common_reversed=list(reversed(list_2_common))
    
        while i<len(list_1_common):
    
            if list_2_common_reversed[i]==list_1_common[i]:
    
        return (cpt>len(list_1_common)*0.9) # if more than 90% of the segments are on the same position when the lists are reversed, there is an inversion. 
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    
    # check if the two dict have the same segments but in different orientation
    
    def detect_orient_inversion(dict1,dict2):
        [seg_common_count,same_strand_count]=compare_strand(dict1,dict2)
    
        if same_strand_count>=seg_common_count*0.9: # if more than 90% of segments shared have the same strand, no inversion
            return [False,dict1,dict2]
    
    # takes two lists of segments for two genes, check if the first list is an inversion of the second one (if the segments in common are on the opposite strand)
    
    def detect_feature_inversion(list_1,list_2):
    
        # convert list into dict with unstranded segment id as key and strand as value 
        strand1=[seg[0] for seg in list_1]
        id1=[seg[1:] for seg in list_1]
        dict1 = {id1[i]: strand1[i] for i in range(len(strand1))}
        strand2=[seg[0] for seg in list_2]
        id2=[seg[1:] for seg in list_2]
        dict2 = {id2[i]: strand2[i] for i in range(len(strand2))}
    
        # check if we have an inversion of the orientation of the segments
    
        [strand_inversion,dict1,dict2]=detect_orient_inversion(dict1,dict2)
    
    
        # check if we have an inversion of the order of the segments
    
        segment_order_inversion=detect_segment_order_inversion(dict1,dict2)
    
        # if there we have both inversions, the gene is in an inverted region
    
        if segment_order_inversion and strand_inversion:
    
    nina.marthe_ird.fr's avatar
    nina.marthe_ird.fr committed
    # invert all the segments in a list (change the orientation)
    
    def invert_segment_list(seg_list):
        list_inverted=list()
        for seg in seg_list:
    
        return list(reversed(list_inverted))