diff --git a/Graph_gff.py b/Graph_gff.py index 6af09f4d6c1ad66a65899c8ee3bad2d80e759b1c..29ccbcd37454ed701a2029beb5858d451796c6d2 100644 --- a/Graph_gff.py +++ b/Graph_gff.py @@ -6,113 +6,113 @@ global Segments Segments={} Features={} +global print_graph_gff +print_graph_gff=[0,"",""] + +# write in output file +def write_line(line,force): + print_graph_gff[1]+=line + print_graph_gff[0]+=1 + if (print_graph_gff[0]>999) | (force==True): + print_graph_gff[2].write(print_graph_gff[1]) + print_graph_gff[0]=0 + print_graph_gff[1]="" # functions to create the segments and the features #Â create a segment with its first feature -def init_seg(line,feature): - line=line.split() - seg_id=line[3][1:] - chr=line[0] - start=line[1] - stop=line[2] - +def init_seg(line,feature_id,segment_id,strand): + [chr,start,stop]=[line[0],line[1],line[2]] #Â add the current feature to the list of features that are on the segment - feature_strand=line[10] - feature_stranded=feature_strand+feature - feat=list() - feat.append(feature_stranded) - - # create the segment - Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop) - + feature_stranded=strand+feature_id + feature_list=[feature_stranded] + # create the segment, store it in the Segments dict + Segments[segment_id]=Class.Segment(segment_id,feature_list,chr,start,stop) # create a feature with the first segment its on -def init_feature(line): - line=line.split() - feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_") - type=line[6] - annot=line[12] - chr=line[4] - start=line[7] - stop=line[8] - childs=list() - - # add the current segment to the list of segments that have the feature - seg=line[3][1:] - strand=line[10] - seg_stranded=strand+seg - segments_list=list() - segments_list.append(seg_stranded) - +def init_feature(line,feature_id,segment_id,strand): + [type,annot,chr,start,stop,childs]=[line[6],line[12],line[4],line[7],line[8],[]] parent=get_parent(annot,feature_id) - - #Â create the feature + # add the current segment to the list of segments that have the feature + segment_stranded=strand+segment_id + segments_list=[segment_stranded] + #Â create the feature, store it in the dict Features Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list) + set_note(feature_id) - -# if there is a parent, get the id of the parent feature and add the child to the parent +# if there is a parent, returns the id of the parent feature and add the child to the parent def get_parent(annot,feature_id): - #Â if the current feature has a parent, add the current feature in the childs list of its parent if annot.split(";")[1].split("=")[0]=="Parent": # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1 parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_") add_child(parent,feature_id) - elif annot.split(";")[2].split("=")[0]=="Parent": # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2 parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_") add_child(parent,feature_id) else: parent="" - return parent - #Â add a feature to an existing segment def add_feature(seg,new_feature,strand): new_feature_stranded=strand+new_feature if new_feature_stranded not in Segments[seg].features: Segments[seg].features.append(new_feature_stranded) - #Â add a child feature to an existing feature def add_child(feat,new_child): - if feat in Features.keys(): # if the parent feature exists - if new_child not in Features[feat].childs: - Features[feat].childs.append(new_child) - - + if (feat in Features.keys()) & (new_child not in Features[feat].childs): # if the parent feature exists + Features[feat].childs.append(new_child) + #Â add a segment to an existing feature def add_seg(feat,new_seg,strand): seg_stranded=strand+new_seg if seg_stranded not in Features[feat].segments_list: Features[feat].segments_list.append(seg_stranded) +# add the position of the feature on its first and last segment +def add_pos(feature): + feature.pos_start=get_start(feature.segments_list[0][1:],feature.id) + feature.pos_stop=get_stop(feature.segments_list[-1][1:],feature.id) + +def load_feature_intersect(line,feature_id,segment_id,strand): + if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list + init_feature(line, feature_id, segment_id,strand) + else: # if it exists, add the current segment to the list of segments that have the existing feature + add_seg(feature_id,segment_id,strand) + +def load_segment_intersect(line,feature_id,segment_id,strand): + if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list + init_seg(line, feature_id,segment_id,strand) + else: # if it exists, add the current feature to the list of features on the existing segment + add_feature(segment_id,feature_id,strand) -# create a note for the child features that do not have annotation. -def set_note(id): + +# create a note for the child features that do not have annotation. not clean. fct for getting parent ? +def set_note(feature_id): # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features. # in the gff, the notes are only on the "gene" features. it's easier to have it for the childs than to check the parent's note (or the parent's parent). - feat=Features[id] - if feat.type=="gene": # if the feature is a gene, the note is the last field of its annotation. - feat.note=feat.annot.split(';')[-1] + feature=Features[feature_id] + if feature.type=="gene": # if the feature is a gene, the note is the last field of its annotation. + feature.note=feature.annot.split(';')[-1] else: #Â else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation. # we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene. # this is because for example the parent of an exon is the mrna, not the gene itself, so we need to go up until we find the gene. - curent=feat.parent + curent=feature.parent annot_found=False while annot_found==False: if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature note=Features[curent].annot.split(';')[-1] - feat.note=note + feature.note=note annot_found=True else: # if we didn't find the gene, we go up to the current feature's parent until we find it curent=Features[Features[curent].parent].id # create all the Segment and Feature objects in the dictionnaries Segments and Features -def create_seg_feat(intersect_path): +def load_intersect(intersect_path): + print("loading the intersect file") # open the file with the intersect between the segments and the gff file = open(intersect_path, 'r') @@ -120,31 +120,19 @@ def create_seg_feat(intersect_path): file.close() for line in lines: # for each line in the intersect file + line=line.split() # get the ids for the dictionnaries' keys - feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_") - segment_id=line.split()[3][1:] - - if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list - init_feature(line) - else: # if it exists, add the current segment to the list of segments that have the existing feature - strand=line.split()[10] - add_seg(feature_id,segment_id,strand) + feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_") + segment_id=line[3][1:] + strand=line[10] - if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list - init_seg(line, feature_id) - - else: # if it exists, add the current feature to the list of features on the existing segment - strand=line.split()[10] - add_feature(segment_id,feature_id,strand) - - # for all the features, add the note (information on the function of the feature), and the positions on the first and last seg. - # cant always do it before because for that i need to have all the parents in the dict Features, and all the segments in the list segments_list for each feature. - for feat_id in Features: - feat=Features[feat_id] - set_note(feat.id) - feat.pos_start=get_start(feat.segments_list[0][1:],feat.id) - feat.pos_stop=get_stop(feat.segments_list[-1][1:],feat.id) + load_feature_intersect(line,feature_id,segment_id,strand) + load_segment_intersect(line,feature_id,segment_id,strand) + # for all the features, add the position of the feature on its first and last segment. + # cant do it before because for that i need to have all the segments in the list segments_list for each feature. + for feat_id in Features: + add_pos(Features[feat_id]) @@ -162,7 +150,6 @@ def get_start(seg_id,feat_id): result=f.start-s.start+1 return result - # get the feature's stop position on the segment def get_stop(seg_id,feat_id): s=Segments[seg_id] @@ -173,49 +160,39 @@ def get_stop(seg_id,feat_id): result=f.stop-s.start+1 return result +# generates the annotation for the gff of the graph, with the rank of the segment in the feature +def make_annot_graph(feature,segment,strand): + segment_stranded=strand+segment + # get the rank and the total number of ocurrences for the feature + rank=str(feature.segments_list.index(segment_stranded)+1) + total=str(len(feature.segments_list)) + # create the annotation with the rank information + annotation=feature.annot+";Rank_occurrence="+rank+";Total_occurrences="+total + return annotation + +def create_line_graph_gff(feature_stranded,segment): + [strand,feature_id]=[feature_stranded[0:1],feature_stranded[1:]] + feature=Features[feature_id] + + annotation=make_annot_graph(feature,segment,strand) + type=feature.type + start=get_start(segment,feature_id) + stop=get_stop(segment,feature_id) + line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n" + return line #Â go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection def graph_gff(file_out): print("generation of the graph's gff") file_out = open(file_out, 'w') - string_out="" - cpt_out=0 + print_graph_gff[2]=file_out for segment in Segments: - # get the list of the features on the segment - features_seg=Segments[segment].features - - #Â go through all the segment's features, and print the gff line for each - for feature_stranded in features_seg: - strand=feature_stranded[0:1] - feature=feature_stranded[1:] - #Â feature[feature] - - #[strand,feature]=[feature_stranded[0:1],feature_stranded[1:]] - segment_stranded=strand+segment - type=Features[feature].type - start=get_start(segment,feature) - stop=get_stop(segment,feature) - - # get the rank and the total number of ocurrences for the feature - rank=str(Features[feature].segments_list.index(segment_stranded)+1) - total=str(len(Features[feature].segments_list)) - - # create the annotation with the rank information - annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total - - # write the gff line in the output file - line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n" - string_out+=line - cpt_out+=1 - #file_out.write(line) - - if cpt_out==1000: - file_out.write(string_out) - string_out="" - cpt_out=0 - #Â faire un fct print - file_out.write(string_out) + features_seg=Segments[segment].features # get the list of the features on the segment + for feature_stranded in features_seg: #Â go through all the segment's features, and print the gff line for each + line = create_line_graph_gff(feature_stranded,segment) + write_line(line,False) + write_line("",True) file_out.close() @@ -223,4 +200,3 @@ def graph_gff(file_out): -