import ClassSegFeat as Class # functions to create the segments and the features # create segment def init_seg(line,feature): line=line.split() seg_id=line[3][1:] chr=line[0] start=line[1] stop=line[2] # add the current feature (stranded) to the list of features that are on the segment feature_strand=line[10] feature_stranded=feature_strand+feature feat=list() feat.append(feature_stranded) Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop) # create feature def init_feature(line): line=line.split() feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_") type=line[6] annot=line[12] chr=line[4] start=line[7] stop=line[8] childs=list() # add the current segment (stranded) to the list of segments that have the feature seg=line[3][1:] strand=line[10] seg_stranded=strand+seg segments_list=list() segments_list.append(seg_stranded) # if the current feature as a parent, add the feature un the childs list of its parent if annot.split(";")[1].split("=")[0]=="Parent": # for annotations like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1 parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_") add_child(parent,feature_id) elif annot.split(";")[2].split("=")[0]=="Parent": # for annotations like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2 parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_") add_child(parent,feature_id) else: parent="" Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list) # add a feature (stranded) to an existing segment def add_feature(seg,new_feature,strand): new_feature_stranded=strand+new_feature if new_feature_stranded not in Segments[seg].features: Segments[seg].features.append(new_feature_stranded) # add a child to an existing feature def add_child(feat,new_child): if feat in Features.keys(): if new_child not in Features[feat].childs: Features[feat].childs.append(new_child) # add a segment (stranded) to an existing feature def add_seg(feat,new_seg,strand): seg_stranded=strand+new_seg if seg_stranded not in Features[feat].segments_list: Features[feat].segments_list.append(seg_stranded) # create all the Segment and Feature objects in the dictionnaries Segments and Features def create_seg_feat(intersect_path): global Features global Segments Segments={} Features={} # open the intersect between the segments and the gff file = open(intersect_path, 'r') lines=file.readlines() for line in lines: # get the ids for the dictionnaries' keys feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_") segment_id=line.split()[3][1:] if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list init_feature(line) else: # add the current segment to the list of segments that have the existing feature strand=line.split()[10] add_seg(feature_id,segment_id,strand) if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list init_seg(line, feature_id) else: # add the current feature to the existing segment strand=line.split()[10] add_feature(segment_id,feature_id,strand) file.close() # functions to generate the graph's gff from the segments and features created with create_seg_feat # get the feature's start position on the segment def get_start(seg,feat): s=Segments[seg] f=Features[feat] if s.start>f.start: result=1 else: result=f.start-s.start return result # get the feature's stop position on the segment def get_stop(seg,feat): s=Segments[seg] f=Features[feat] if s.stop<f.stop: result=s.size else: result=f.stop-s.start return result # go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection def graph_gff(file_out): print("generation of the graph's gff") file_out = open(file_out, 'w') for segment in Segments: # get the list of the features on the segment features_seg=Segments[segment].features # go through all these features, and print the gff line for each for feature_stranded in features_seg: strand=feature_stranded[0:1] feature=feature_stranded[1:] segment_stranded=strand+segment type=Features[feature].type #start=max(1,get_start(segment,feature)) start=get_start(segment,feature) stop=get_stop(segment,feature) # get the rank and the total number of ocurrences for the feature rank=str(Features[feature].segments_list.index(segment_stranded)+1) total=str(len(Features[feature].segments_list)) # create the annotation with the rank information annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total # generate the gff line ligne=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n" #print(ligne) file_out.write(ligne) file_out.close() # functions to generate a genome's gff from the graph's gff # get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome def get_start_2(seg_start, seg_ac, feat_id): s_start=seg_ac[seg_start][1] f_start=get_start(seg_start,feat_id) start=int(s_start)+int(f_start) return start # get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome def get_stop_2(seg_stop, seg_ac, feat_id): s_start=seg_ac[seg_stop][1] f_stop=get_stop(seg_stop,feat_id) stop=int(s_start)+int(f_stop) return stop # get the position of a piece of a feature on the complete feature (on the original genome) def get_position(seg_start,seg_stop,gene_start,feature,seg_ac): # start position of the complete gene on the reference start_gene_start=Segments[gene_start].start+get_start(gene_start,feature.id)-1 # start position of the piece of the feature on the reference start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1 # start and stop position of the feature on azucena, to get the length of the piece of the feature start_azu=get_start_2(seg_start,seg_ac,feature.id) stop_azu=get_stop_2(seg_stop,seg_ac,feature.id) # start position of the piece of the feature on the complete feature start_gene=int(start_first_seg)-int(start_gene_start)+1 # stop position : start+length stop_gene=start_gene+(stop_azu-start_azu) position=";position="+str(start_gene)+"-"+str(stop_gene) #position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene) return position # get the proportion of the piece of the feature on the total length def get_proportion(seg_start,seg_stop,seg_ac,feature): start_azu=get_start_2(seg_start,seg_ac,feature.id) # start position of the feature on azucena stop_azu=get_stop_2(seg_stop,seg_ac,feature.id) # stop position of the feature on azucena proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size) #proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size) return proportion # returns the gff line to write in the output file def write_line(list_seg,i,feat,seg_start,gene_start,seg_ac): seg_stop=list_seg[i-1][1:] feature=Features[feat] chr=seg_ac[seg_start][0] strand=list_seg[i-1][0:1] # start and stop position of the feature on azucena start_azu=get_start_2(seg_start,seg_ac,feature.id) stop_azu=get_stop_2(seg_stop,seg_ac,feature.id) proportion=get_proportion(seg_start,seg_stop,seg_ac,feature) position=get_position(seg_start,seg_stop,gene_start,feature,seg_ac) annotation=feature.annot+proportion+position out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n" return out_line # writes the gff of azucena using the gff of the graph def genome_gff(pos_seg, gff, out): print("generation of the genome's gff ") # create a dictionnary with the positions of the segments on azucena seg_ac={} bed=open(pos_seg,'r') lines=bed.readlines() for line in lines: line=line.split() s_id=line[3][1:] ch=line[0] start=line[1] stop=line[2] seg_ac[s_id]=list([ch,start,stop]) bed.close() gff=open(gff,'r') file_out = open(out, 'w') lines=gff.readlines() # get the list of all the features to transfer list_feature=list() for line in lines: id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_")) if id not in list_feature: list_feature.append(id) # stats on how many segments are absent in azucena, their average length, etc stats=False if stats==True: seg_first=list() seg_middle=list() seg_last=list() seg_total=list() seg_ok=list() seg_entier=list() feature_missing_first=list() feature_missing_middle=list() feature_missing_last=list() feature_missing_all=list() feature_missing_total=list() # for each feature, get list of the segments where it is for feat in list_feature: list_seg=Features[feat].segments_list size_list=len(list_seg) # stats on how many segments are absent in azucena, their average length, etc if stats==True: for segment in list_seg: seg_len=Segments[segment[1:]].size if segment[1:] not in seg_ac: seg_total.append(seg_len) if feat not in feature_missing_total: feature_missing_total.append(feat) if segment==list_seg[0]: if segment==list_seg[size_list-1]: seg_entier.append(seg_len) if feat not in feature_missing_all: feature_missing_all.append(feat) else: seg_first.append(seg_len) if feat not in feature_missing_first: feature_missing_first.append(feat) elif segment==list_seg[size_list-1]: seg_last.append(seg_len) if feat not in feature_missing_last: feature_missing_last.append(feat) else: seg_middle.append(seg_len) if feat not in feature_missing_middle: feature_missing_middle.append(feat) else: seg_ok.append(seg_len) gene_start="empty" seg_start="empty" seg_stop="empty" # loop that goes through all the segments that have the current feature # keeps the first segment present in azucena found, and when it finds a segment absent in azucena, prints the piece of the fragment, and resets the first segment present. # at the end of the loop, prints the last piece of the fragment. for i in range(0,size_list): if list_seg[i][1:] in seg_ac: if gene_start=="empty": gene_start=list_seg[i][1:] if seg_start=="empty": seg_start=list_seg[i][1:] #else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena) else: if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start out_line=write_line(list_seg,i,feat,seg_start,gene_start,seg_ac) file_out.write(out_line) seg_start="empty" seg_stop="empty" #else: if the current segment is not in azucena but there is no start, keep looking for a start # print the last piece of the feature in the end if list_seg[i][1:] in seg_ac: out_line=write_line(list_seg,i+1,feat,seg_start,gene_start,seg_ac) file_out.write(out_line) file_out.close() gff.close() # print stats from statistics import median, mean if stats==True: print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first)) print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle)) multiple_3=0 for seg in seg_middle: if seg%3==0:multiple_3+=1 print(multiple_3,"segments missing in the middle of a feature, of length=3k") print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last)) print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier)) print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total)) print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok)) print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(117+5)/153,2),"% hypothetical or putative.") print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(180+3)/263,2),"% hypothetical or putative.") print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(115+7)/156,2),"% hypothetical or putative.") print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(314+24)/404,2),"% hypothetical or putative.") print("there is at least one segment missing", len(feature_missing_total) ,"features, including",round(100*(606+36)/817,2),"% hypothetical or putative.") print("there is", len(list_feature) ,"features in total, including",round(100*(2108+171)/3510,2),"% hypothetical or putative.") feat_miss_first=open('features_missing_first.txt','w') feat_miss_first.write("features missing the first segment on azucena :\n") for first in feature_missing_first : feat_miss_first.write(Features[first].annot) feat_miss_first.write("\n") feat_miss_first.close() feat_miss_middle=open('features_missing_middle.txt','w') feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n") for middle in feature_missing_middle : feat_miss_middle.write(Features[middle].annot) feat_miss_middle.write("\n") feat_miss_middle.close() feat_miss_last=open('features_missing_last.txt','w') feat_miss_last.write("\nfeatures missing the last segment on azucena :\n") for last in feature_missing_last : feat_miss_last.write(Features[last].annot) feat_miss_last.write("\n") feat_miss_last.close() feat_miss=open('features_missing.txt','w') feat_miss.write("\nfeatures missing entirely on azucena :\n") for entier in feature_missing_all : feat_miss.write(Features[entier].annot) feat_miss.write("\n") feat_miss.close() feat_miss_total=open('features_missing_total.txt','w') feat_miss_total.write("\nfeatures missing a segment on azucena :\n") for total in feature_missing_total : feat_miss_total.write(Features[total].annot) feat_miss_total.write("\n") feat_miss_total.close() feat_all=open('all_features.txt','w') for feat in list_feature: feat_all.write(Features[feat].annot) feat_all.write("\n") feat_all.close() # writes the gff of azucena using the gff of the graph def genome_gff_test(pos_seg, gff, out): print("generation of the genome's gff ") # create a dictionnary with the positions of the segments on azucena seg_ac={} bed=open(pos_seg,'r') lines=bed.readlines() for line in lines: line=line.split() s_id=line[3][1:] ch=line[0] start=line[1] stop=line[2] seg_ac[s_id]=list([ch,start,stop]) bed.close() gff=open(gff,'r') file_out = open(out, 'w') lines=gff.readlines() # get the list of all the features to transfer list_feature=list() for line in lines: id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_")) if id not in list_feature: list_feature.append(id) # for each feature, get list of the segments where it is for feat in list_feature: list_seg=Features[feat].segments_list size_list=len(list_seg) gene_start="empty" seg_start="empty" seg_stop="empty" # prints all gene fragments. for i in range(0,size_list): if (list_seg[i][1:] in seg_ac) & (1==1): feature=Features[feat] seg_start=list_seg[i][1:] seg_stop=list_seg[i][1:] chr=seg_ac[seg_start][0] start_azu=get_start_2(seg_start,seg_ac,feature.id) stop_azu=get_stop_2(seg_stop,seg_ac,feature.id) out_line=chr+" "+str(start_azu)+" "+str(stop_azu)+" "+seg_start+" "+seg_stop+" "+feature.id+"\n" file_out.write(out_line) file_out.close() gff.close()