import subprocess from Graph_gff import Segments, Features, get_feature_start_on_segment, get_feature_stop_on_segment, write_line ### functions to generate a genome's gff from the graph's gff # get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome def get_feature_start_on_genome(start_seg,feat_id): seg_start_pos=segments_on_target_genome[start_seg][1] feat_start_pos=get_feature_start_on_segment(start_seg,feat_id) start=int(seg_start_pos)+int(feat_start_pos)-1 return start # get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome def get_feature_stop_on_genome(stop_seg,feat_id): seg_start_pos=segments_on_target_genome[stop_seg][1] feat_stop_pos=get_feature_stop_on_segment(stop_seg,feat_id) stop=int(seg_start_pos)+int(feat_stop_pos)-1 return stop # functions to get the detailed gff with all the fragments of the features # get the position of a part of a feature on the complete feature (on the original genome) def get_position_on_feature(start_seg,stop_seg,feature_start_segment,feature): start_on_feature=get_segment_start_on_feature(feature_start_segment,start_seg,feature) stop_on_feature=get_segment_stop_on_feature(start_seg,stop_seg,feature,start_on_feature) position=";position="+str(start_on_feature)+"-"+str(stop_on_feature) #position=";start_position_on_feature="+str(start_on_gene)+":stop_position_on_feature="+str(stop_on_gene) return position def get_segment_start_on_feature(feature_start_segment,start_seg,feature): feature_start_pos=Segments[feature_start_segment].start+get_feature_start_on_segment(feature_start_segment,feature.id)-1 start_on_reference=Segments[start_seg].start+get_feature_start_on_segment(start_seg,feature.id)-1 start_on_feature=int(start_on_reference)-int(feature_start_pos)+1 return start_on_feature def get_segment_stop_on_feature(start_seg,stop_seg,feature,start_on_feature): start_on_new_genome=get_feature_start_on_genome(start_seg,feature.id) stop_on_new_genome=get_feature_stop_on_genome(stop_seg,feature.id) # length on new genome is the same as length on reference, as we only get the positions of conserved parts of the feature stop_on_feature=start_on_feature+(stop_on_new_genome-start_on_new_genome) # stop position : start+length return stop_on_feature # get the proportion of a part of the feature on the total length def get_proportion_of_feature_part(start_seg,stop_seg,feature): start_on_new_genome=get_feature_start_on_genome(start_seg,feature.id) stop_on_new_genome=get_feature_stop_on_genome(stop_seg,feature.id) # length on new genome is the same as length on reference, as we only get the positions of conserved parts of the feature proportion=";proportion="+str(stop_on_new_genome-start_on_new_genome+1)+"/"+str(feature.size) #proportion=";number_bases="+str(stop_new_genome-start_new_genome+1)+";total_bases="+str(feature.size) return proportion # takes two lists of segments for two genes, check if the first list is an inversion of the second one (if the segments in common are on the opposite strand) def detect_inversion(list_1,list_2): # removes strand for the lists of stranded segments list_1_unstrand=[segment_stranded[1:] for segment_stranded in list_1] list_2_unstrand=[segment_stranded[1:] for segment_stranded in list_2] [seg_common,same_strand_count]=compare_strand(list_1,list_2,list_1_unstrand,list_2_unstrand) # if there is less than 10% of strand difference among the common segments (ie more than 90% of same strand), return False, no inversion if len(seg_common)-same_strand_count<len(seg_common)/10: return [list_1_unstrand,list_2_unstrand,0] else: return [list_1_unstrand,list_2_unstrand,1] def compare_strand(list_1,list_2,list_1_unstrand,list_2_unstrand): # get the list of segments in common seg_common=[] for segment in list_1_unstrand: if segment in list_2_unstrand: seg_common.append(segment) # for each segment in common, check if the strand is the same. check index in list unstranded to get the segment in list stranded same_strand_count=0 for segment in seg_common: index_1=list_1_unstrand.index(segment) index_2=list_2_unstrand.index(segment) if list_1[index_1]==list_2[index_2]: same_strand_count+=1 return [seg_common,same_strand_count] # returns the gff line to write in the output file for the function gff_detail def create_line_detail(last_seg,feature_id,start_seg,feat_start): [stop_seg,feature,chr,strand]=[last_seg[1:],Features[feature_id],segments_on_target_genome[start_seg][0],last_seg[0:1]] # start and stop position of the feature on the genome we transfer on start_on_new_genome=get_feature_start_on_genome(start_seg,feature_id) stop_on_new_genome=get_feature_stop_on_genome(stop_seg,feature_id) proportion=get_proportion_of_feature_part(start_seg,stop_seg,feature) position=get_position_on_feature(start_seg,stop_seg,feat_start,feature) annotation=feature.annot+proportion+position output_line=chr+"\tGrAnnot\t"+feature.type+"\t"+str(start_on_new_genome)+"\t"+str(stop_on_new_genome)+"\t.\t"+strand+"\t.\t"+annotation+"\n" return output_line # outputs each fragment of the feature in a gff def gff_detail(list_seg,feature_id): # loop that goes through all the segments that have the current feature # keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the current part of the fragment, and resets the first segment present. # continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature, etc. # at the end of the loop, prints the last part of the fragment. [feat_start,seg_start,last_seg]=["","",""] # first segment of the feature, first segment of the current part of the feature, last segment in the for loop below for segment in list_seg: if segment[1:] in segments_on_target_genome: if feat_start=="": feat_start=segment[1:] if seg_start=="": # if we dont have a start, take the current segment for the start of the part of the feature seg_start=segment[1:] #else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena) else: if seg_start!="": # found a stop and we have a start. print the line, reset seg_start, and keep going through the segments to find the next seg_start out_line=create_line_detail(last_seg,feature_id,seg_start,feat_start) write_line(out_line,output_detail_gff,False) seg_start="" #else: if the current segment is not in azucena but there is no start, keep looking for a start last_seg=segment if last_seg[1:] in segments_on_target_genome: out_line=create_line_detail(list_seg[-1],feature_id,seg_start,feat_start) write_line(out_line,output_detail_gff,False) # functions to get the gff with one line per feature def right_size(size,max_diff,feat): if max_diff==0: return True return not ((size>Features[feat].size*max_diff) | (size<Features[feat].size/max_diff)) def create_line_gff_one(first_seg,last_seg,feature_id,size_diff,list_seg): [chr,strand,feature]=[segments_on_target_genome[first_seg][0],list_seg[0][0:1],Features[feature_id]] variant_count=0 for segment in list_seg: if segment[1:] not in segments_on_target_genome: # if a segment is absent, it is either a deletion of a substitution. insertions are not considered here. variant_count+=1 annotation=feature.annot+";Size_diff="+str(size_diff)+";Nb_variants="+str(variant_count) line=chr+" GrAnnot "+feature.type+" "+str(get_feature_start_on_genome(first_seg,feature_id))+" "+str(get_feature_stop_on_genome(last_seg,feature_id))+" . "+strand+" . "+annotation+"\n" return line # outputs the feature once in a gff, from the first to the last segment present on the new genome (if the size is ok) : def gff_one(first_seg,last_seg,feature_id,list_seg,max_diff): if (first_seg!=''): # feature present on the target genome size_on_new_genome=get_feature_stop_on_genome(last_seg,feature_id)-get_feature_start_on_genome(first_seg,feature_id)+1 size_diff=abs(size_on_new_genome-Features[feature_id].size) if right_size(size_on_new_genome,max_diff,feature_id): line=create_line_gff_one(first_seg,last_seg,feature_id,size_diff,list_seg) write_line(line,output_gff_one,False) return size_diff else: return "bad_size" else : return "absent" def get_segments_positions_on_genome(pos_seg): global segments_on_target_genome segments_on_target_genome={} bed=open(pos_seg,'r') lines=bed.readlines() # read line by line ? bed.close() for line in lines: line=line.split() [seg,chrom,start,stop]=[line[3][1:],line[0],line[1],line[2]] if line[3][0:1]=='>': strand='+' elif line[3][0:1]=='<': strand='-' else: strand='' segments_on_target_genome[seg]=[chrom,start,stop,strand] def get_segments_sequence_and_paths(gfa): file_gfa=open(gfa,'r') lines_gfa=file_gfa.readlines() file_gfa.close() seg_seq={} paths={} for line in lines_gfa: line=line.split() if (line[0]=="S"): # get the sequence of the segment seg_id='s'+line[1] seg_seq[seg_id]=line[2] if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome path=line[6].replace(">",";>") path=path.replace("<",";<").split(';') list_path=[] for segment in path: if segment[0:1]=='>': list_path.append('+s'+segment[1:]) elif segment[0:1]=='<': list_path.append('-s'+segment[1:]) else: list_path.append('s'+segment[1:]) paths[line[3]]=list_path return [paths,seg_seq] def get_first_seg(list_seg): first_seg_found='' for segment in list_seg: if segment[1:] in segments_on_target_genome: first_seg_found=segment[1:] break return first_seg_found def get_all_features_in_gff(gff): gff=open(gff,'r') lines=gff.readlines() gff.close() list_feature=[] for line in lines: feature_id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_")) if feature_id not in list_feature: list_feature.append(feature_id) return list_feature # functions to output the stats on the transfer def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feature_id): # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok] feature_missing_segments[5].append(feature_id) if (first_seg=='') : # no segment of the feature is in the genome, the feature is missing entirely feature_missing_segments[3].append(feature_id) elif first_seg != list_seg[0][1:]: # the first segment is missing feature_missing_segments[0].append(feature_id) elif last_seg!=list_seg[-1][1:]: # the last segment is missing feature_missing_segments[2].append(feature_id) # go through all the segments, check if some are missing in the middle of the feature elif (len(list_seg)!=1) & (feature_id not in feature_missing_segments[3]): # to access the second to last element for segment in list_seg[1-(len(list_seg)-2)]: if segment[1:] not in segments_on_target_genome: feature_missing_segments[1].append(feature_id) break # go through the segments, to see if one is missing anywhere on the feature for segment in list_seg: if segment[1:] not in segments_on_target_genome: if feature_id not in feature_missing_segments[4]: feature_missing_segments[4].append(feature_id) break # if the feature doesnt have a missing segment, it is complete. ADD THE PATH CHECK FOR INSERTIONS !! if feature_id not in feature_missing_segments[4]: feature_missing_segments[6].append(feature_id) def get_annot_features(list_features): list_annot_features=[] for feature in list_features: list_annot_features.append(Features[feature].note) return list_annot_features def count_hypput_total(list_annot_first): total=len(list_annot_first) count_hypput=0 for annot in list_annot_first: if ("hypothetical" in annot) | ("putative" in annot): count_hypput+=1 return [count_hypput,total] # print stats on the transfer : number of feature that have segments in different positions missing. def stats_features(feature_missing_segments): # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok] list_annot_first=get_annot_features(feature_missing_segments[0]) [hyp_put,total]=count_hypput_total(list_annot_first) print("\nthe first segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_middle=get_annot_features(feature_missing_segments[1]) [hyp_put,total]=count_hypput_total(list_annot_middle) print("a middle segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_last=get_annot_features(feature_missing_segments[2]) [hyp_put,total]=count_hypput_total(list_annot_last) print("the last segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_all=get_annot_features(feature_missing_segments[3]) [hyp_put,total]=count_hypput_total(list_annot_all) print(total,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_total=get_annot_features(feature_missing_segments[4]) [hyp_put,total]=count_hypput_total(list_annot_total) print("there is at least one segment missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_ok=get_annot_features(feature_missing_segments[6]) [hyp_put,total]=count_hypput_total(list_annot_ok) print(total ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") list_annot_features=get_annot_features(feature_missing_segments[5]) [hyp_put,total]=count_hypput_total(list_annot_features) print("there is", total,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.") # writes the gff of azucena using the gff of the graph def genome_gff(pos_seg, gff, out, gfa): print("generation of the genome's gff ") [once,detail,var,stats,plot]=[True,False,True,True,False] if var: [paths,seg_seq]=get_segments_sequence_and_paths(gfa) file_out_var = open("variations.txt", 'w') global output_variations output_variations=[0,"",file_out_var] if detail: file_out_detail = open("azucena_chr10_detail.gff", 'w') global output_detail_gff output_detail_gff=[0,"",file_out_detail] if once: file_out=open(out,'w') global output_gff_one output_gff_one=[0,"",file_out] bad_size_features=0 absent_features=0 diff_size_transfered_features=[0,0] # [count,sum], to get the average get_segments_positions_on_genome(pos_seg) list_feature_to_transfer=get_all_features_in_gff(gff) # get the list of all the features to transfer from the gff # create objects for stats on how many segments are absent in azucena, their average length, etc if stats==True: feature_missing_segments=[[],[],[],[],[],[],[]] # [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok] # the fist segment of the feature is missing - feature_missing_first # the last segment of the feature is missing - feature_missing_last # at least one middle segment of the feature is missing - feature_missing_middle # the entire feature is missing - feature_missing_all # at least one segment is missing first, last, or middle) - feature_missing_total # no segment is missing, the feature is complete - feature_ok # total number of features, with missing segments or not - feature_total for feat in list_feature_to_transfer: # for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome list_seg=Features[feat].segments_list first_seg=get_first_seg(list_seg) last_seg=get_first_seg(reversed(list_seg)) if stats==True: stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feat) # outputs the feature once in a gff, from the first to the last segment present on the new genome if once: max_diff=2 # maximum difference size (n times bigger of smaller) add=gff_one(first_seg,last_seg,feat,list_seg,max_diff) if add=="bad_size": bad_size_features+=1 elif add=="absent": absent_features+=1 else: diff_size_transfered_features[0]+=1 diff_size_transfered_features[1]+=add write_line("",output_gff_one,True) # outputs each fragment of the feature in a gff if detail: gff_detail(list_seg,feat) # insertions ! write_line("",output_detail_gff,True) # outputs the detail of variations of the feature : if var: print_variations(first_seg,last_seg,feat,paths,seg_seq) write_line("",output_variations,True) if detail: file_out_detail.close() if once: file_out.close() if var: file_out_var.close() # print stats from statistics import median, mean if stats==True: if once: print(len(Features)-(bad_size_features+absent_features),"out of",len(Features),"features are transfered.") print(bad_size_features,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.") print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.") print("average length difference of the transfered genes : ",diff_size_transfered_features[1]/diff_size_transfered_features[0]) stats_features(feature_missing_segments) # outputs the detail of the variations on the new genome for the feature def print_variations(first_seg,last_seg,feat,paths,seg_seq): if (first_seg!=''): # if the feature is not completly absent var=0 # count variations, to see if there is any feature=Features[feat] feat_start=feature.start # get the lengths of the feature, on the original genome and on the new one start_new_genome=get_feature_start_on_genome(first_seg,feat) stop_new_genome=get_feature_stop_on_genome(last_seg,feat) size_new_genome=int(stop_new_genome)-int(start_new_genome)+1 size_diff=str(size_new_genome-feature.size) # get feature paths on the original genome and on the target genome list_segfeat_azu=get_feature_path(paths,first_seg,last_seg) list_segfeat_nb=feature.segments_list # loop to go through both paths i=0 j=0 [last,last_taille,last_seq,last_start,last_in_azu]=['',0,'','',''] # check if there is an inversion and remove strands [list_segfeat_nb,list_segfeat_azu,inversion]=detect_inversion(list_segfeat_nb,list_segfeat_azu) # detect and print variations ignoring the strands while (i<len(list_segfeat_nb)) & (j<len(list_segfeat_azu)): if list_segfeat_nb[i] != list_segfeat_azu[j]: # if there is a difference between the two paths if list_segfeat_azu[j] not in list_segfeat_nb: # if the segment in azu is absent in nb if list_segfeat_nb[i] not in list_segfeat_azu: # if the segment in nb is absent is azu # is both segments are absent in the other genome, its a substitution last_in_azu=list_segfeat_azu[j] # print if we had an insertion or deletion running if last=='insertion': [pos_old,pos_new]=get_old_new_pos_insertion(last_start,feat_start,list_segfeat_azu,feat) line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n" write_line(line,output_variations,False) var+=1 elif last=='deletion': [pos_old,pos_new]=get_old_new_pos_deletion(last_start,feat_start,list_segfeat_azu,feat,last_in_azu,i) line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n" write_line(line,output_variations,False) var+=1 last='';last_taille=0;last_seq='';last_start='' # print the substitution # substitution of segment list_segfeat_nb[i][1:] by segment list_segfeat_azu[j][1:] [pos_old,pos_new]=get_old_new_pos_substitution(feat_start,list_segfeat_nb,list_segfeat_azu,feat,i,j) if len(seg_seq[list_segfeat_nb[i]]) == len(seg_seq[list_segfeat_azu[j]]): # if the substituion is between two segment of the same size, print it size_subs=len(seg_seq[list_segfeat_nb[i]]) line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tsubstitution\t"+seg_seq[list_segfeat_nb[i]]+"\t"+seg_seq[list_segfeat_azu[j]]+"\t"+str(size_subs)+"\t"+str(pos_old)+"\t"+pos_new+"\n" else : # if the segments of the substitution have a different size, print deletion then insertion at the same position. line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+seg_seq[list_segfeat_nb[i]]+"\t-\t"+str(len(seg_seq[list_segfeat_nb[i]]))+"\t"+str(pos_old)+"\t"+pos_new+"\n" line+=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tinsertion\t-\t"+seg_seq[list_segfeat_azu[j]]+"\t"+str(len(seg_seq[list_segfeat_azu[j]]))+"\t"+str(pos_old)+"\t"+pos_new+"\n" var+=1 write_line(line,output_variations,False) var+=1;i+=1;j+=1 else: # azu segment not in nb, but nb segment in azu : insertion if last=='deletion': [pos_old,pos_new]=get_old_new_pos_deletion(last_start,feat_start,list_segfeat_azu,feat,last_in_azu,i) line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n" write_line(line,output_variations,False) var+=1;last='';last_taille=0;last_start='';last_seq='' last_in_azu=list_segfeat_azu[j] if last=='insertion': last_seq=last_seq+seg_seq[list_segfeat_azu[j]] else: last='insertion' last_seq=seg_seq[list_segfeat_azu[j]] last_start=list_segfeat_nb[i] j+=1 elif list_segfeat_nb[i] not in list_segfeat_azu: # nb segment not in azu, but azu segment in nb : deletion if last=='insertion': [pos_old,pos_new]=get_old_new_pos_insertion(last_start,feat_start,list_segfeat_azu,feat) line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n" write_line(line,output_variations,False) var+=1;last='';last_start='';last_taille=0;last_seq='' if last=='deletion': last_seq=last_seq+seg_seq[list_segfeat_nb[i]] else: last='deletion' last_start=list_segfeat_nb[i] if i==0: # if the deletion is at the start of the feature, the deletion doesnt start at the start at the first segment : #use pos_start, position of the feature on its first segment last_seq=seg_seq[list_segfeat_nb[i]][feature.pos_start-1:] else: # else, the deletion will always start at the start of the first segment. last_seq=seg_seq[list_segfeat_nb[i]] i+=1 else : # idk yet. if both segments are present in the other genome but not at the same position. probably substitution then line="weird order change" write_line(line,output_variations,False) var+=1;i+=1;j+=1 else: # segment present in both. print the running indel if there is one if last=='insertion': [pos_old,pos_new]=get_old_new_pos_insertion(last_start,feat_start,list_segfeat_azu,feat) line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n" write_line(line,output_variations,False) var+=1 elif last=='deletion': [pos_old,pos_new]=get_old_new_pos_deletion(last_start,feat_start,list_segfeat_azu,feat,last_in_azu,i) line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n" write_line(line,output_variations,False) var+=1 last_in_azu=list_segfeat_azu[j] last='';last_taille=0;last_start='';last_seq='';i+=1;j+=1 # finish printing the current indel if last=='insertion': [pos_old,pos_new]=get_old_new_pos_insertion(last_start,feat_start,list_segfeat_azu,feat) line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(last_taille)+"\t"+str(pos_old)+"\t"+pos_new+"\n" write_line(line,output_variations,False) var+=1 elif last=='deletion': [pos_old,pos_new]=get_old_new_pos_deletion(last_start,feat_start,list_segfeat_azu,feat,last_in_azu,i) line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n" write_line(line,output_variations,False) var+=1 # see if the end is missing for one of the two genomes if not((i>=len(list_segfeat_nb)-1) & (j>=len(list_segfeat_azu)-1)): pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1 del_sequence=get_sequence_list_seg(list_segfeat_nb,i,feature,seg_seq) length=len(del_sequence) pos_new=str(size_new_genome+1) # the deletion is at the end of the feature on the new genome line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+str(inversion)+"\t"+size_diff+"\tdeletion\t"+del_sequence+"\t-\t"+str(length)+"\t"+str(pos_old)+"\t"+pos_new+"\n" write_line(line,output_variations,False) var+=1 if var==0: line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(size_new_genome)+"\t"+size_diff+"\tno_var\t-\t-\t-\t-\t-\n" write_line(line,output_variations,False) def get_feature_path(paths,first_seg,last_seg): # find the path in azucena. first_strand=segments_on_target_genome[first_seg][3] first_seg_stranded=first_strand+first_seg last_strand=segments_on_target_genome[last_seg][3] last_seg_stranded=last_strand+last_seg id_first_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(first_seg_stranded)) id_last_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(last_seg_stranded)) first_index=min(id_first_seg,id_last_seg) last_index=max(id_last_seg,id_first_seg) list_segfeat_azu=paths["CM020642.1_Azucena_chromosome10"][first_index:last_index+1] return list_segfeat_azu def get_sequence_list_seg(list_segfeat_nb,i,feature,seg_seq): del_sequence="" for k in range(i,len(list_segfeat_nb)): if k==len(list_segfeat_nb): del_sequence+=seg_seq[list_segfeat_nb[k]][0:feature.pos_stop] else: del_sequence+=seg_seq[list_segfeat_nb[k]] return del_sequence def get_old_new_pos_substitution(feat_start,list_segfeat_nb,list_segfeat_azu,feat,i,j): pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1 start_feat=get_feature_start_on_genome(list_segfeat_azu[0],feat) start_var=int(segments_on_target_genome[list_segfeat_azu[j-1]][2])+1 pos_new=str(start_var-start_feat+1) return [pos_old,pos_new] def get_old_new_pos_insertion(last_start,feat_start,list_segfeat_azu,feat): pos_old=str(int(Segments[last_start].start)-int(feat_start)+1) start_feat=get_feature_start_on_genome(list_segfeat_azu[0],feat) start_var=int(segments_on_target_genome[last_start][1])-1 pos_new=str(start_var-start_feat+1) return [pos_old,pos_new] def get_old_new_pos_deletion(last_start,feat_start,list_segfeat_azu,feat,last_in_azu,i): if i==0: pos_old=int(Segments[last_start].start)-int(feat_start)+1+Features[feat].pos_start else: pos_old=int(Segments[last_start].start)-int(feat_start)+1 if pos_old<0: pos_old=0 if last_in_azu=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet. pos_new="1" else: start_feat=get_feature_start_on_genome(list_segfeat_azu[0],feat) start_var=int(segments_on_target_genome[last_in_azu][2])+1 pos_new=str(start_var-start_feat+1) return [pos_old,pos_new] # not used. def get_list_segments_missing(list_seg,segments_on_target_genome): segments_missing=[] for segment in list_seg: if segment[1:] not in segments_on_target_genome: segments_missing.append(Segments[segment[1:]]) return segments_missing # takes a feature and a feature type, returns a list of child features that have the wanted type. currently not used. def get_child_list(feature,child_type): if type=="": return feature.childs list_childs=[] for child in feature.childs: if Features[child].type==child_type: list_childs.append(child) return list_childs