scripts python pour générer les gff du graphe et des génomes, pour l'ancienne...

scripts python pour générer les gff du graphe et des génomes, pour l'ancienne version du graphe (minigraph), issus du jupyternotebook

scripts python pour générer les gff du graphe et des génomes, pour l'ancienne...
c8b1d5c1 · NMarthe · a750b3ed · c8b1d5c1 · c8b1d5c1 · c8b1d5c1
Commit c8b1d5c1 authored Apr 21, 2023 by NMarthe
--- a/ClassSegFeat.py
+++ b/ClassSegFeat.py
+class Segment:
+    def __init__(self,id,feature,chr,start,stop):
+        self.id=id
+        self.features=feature # list of the features on this segment. if thre feature is on the + strand, it will be named '+feature_id', else '-feature_id'
+        # position on the reference genome
+        self.chr=chr
+        self.start=int(start)
+        self.stop=int(stop)
+        self.size=self.stop-self.start+1
+    def __str__(self):
+        return f"id={self.id}, position sur nipponbare={self.chr}:{self.start}-{self.stop}, size={self.size}, features={self.features}"
+class Feature:
+    def __init__(self,id,type,chr,start,stop,annot,childs,parent,seg):
+        self.id=id
+        self.type=type
+        # position on the reference genome
+        self.chr=chr
+        self.start=int(start)
+        self.stop=int(stop)
+        self.size=int(stop)-int(start)+1
+        self.annot=annot
+        self.childs=childs # liste of childs features (exons, cds, etc)
+        self.parent=parent
+        self.segments_list=seg # list of oriented segment on which the feature is (-s1/+s1, depending on the strand)
+    def __str__(self):
+        if self.parent=="":
+            return f"id={self.id}, type={self.type}, segments={self.segments_list}, position sur la référence={self.chr}:{self.start}-{self.stop}, childs={self.childs}, annotation={self.annot}"
+        else:
+            return f"id={self.id}, type={self.type}, segments={self.segments_list}, position sur la référence={self.chr}:{self.start}-{self.stop}, parent={self.parent}, childs={self.childs}, annotation={self.annot}"
\ No newline at end of file
--- a/Functions.py
+++ b/Functions.py
+import ClassSegFeat as Class
+# functions to create the segments and the features
+# create segment
+def init_seg(line,feature):
+    line=line.split()
+    seg_id=line[3][1:]
+    chr=line[0]
+    start=line[1]
+    stop=line[2]
+    # add the current feature (stranded) to the list of features that are on the segment
+    feature_strand=line[9]
+    feature_stranded=feature_strand+feature
+    feat=list()
+    feat.append(feature_stranded)
+    Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)
+# create feature
+def init_feature(line):
+    line=line.split()
+    feature_id=line[11].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
+    type=line[5]
+    annot=line[11]
+    chr=line[4]
+    start=line[6]
+    stop=line[7]
+    childs=list()
+    # add the current segment (stranded) to the list of segments that have the feature
+    seg=line[3][1:]
+    strand=line[9]
+    seg_stranded=strand+seg
+    segments_list=list()
+    segments_list.append(seg_stranded)
+    # if the current feature as a parent, add the feature un the childs list of its parent
+    if annot.split(";")[1].split("=")[0]=="Parent":
+    # for annotations like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
+        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
+        add_child(parent,feature_id)
+    elif annot.split(";")[2].split("=")[0]=="Parent":
+    # for annotations like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
+        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
+        add_child(parent,feature_id)
+    else: parent=""
+    Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
+# add a feature (stranded) to an existing segment
+def add_feature(seg,new_feature,strand):
+    new_feature_stranded=strand+new_feature
+    if new_feature_stranded not in Segments[seg].features:
+        Segments[seg].features.append(new_feature_stranded)
+# add a child to an existing feature
+def add_child(feat,new_child):
+    if feat in Features.keys():
+        if new_child not in Features[feat].childs:
+            Features[feat].childs.append(new_child)
+# add a segment (stranded) to an existing feature
+def add_seg(feat,new_seg,strand):
+    seg_stranded=strand+new_seg
+    if seg_stranded not in Features[feat].segments_list:
+        Features[feat].segments_list.append(seg_stranded)
+# create all the Segment and Feature objects in the dictionnaries Segments and Features
+def create_seg_feat(intersect_path):    
+    global Features
+    global Segments
+    Segments={}
+    Features={}
+    # open the intersect between the segments and the gff
+    file = open(intersect_path, 'r')
+    lines=file.readlines()
+    for line in lines:
+        # get the ids for the dictionnaries' keys
+        feature_id=line.split()[11].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
+        segment_id=line.split()[3][1:]
+        if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
+            init_feature(line)
+        else: # add the current segment to the list of segments that have the existing feature
+            strand=line.split()[9]
+            add_seg(feature_id,segment_id,strand)
+        if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
+            init_seg(line, feature_id)
+        else: # add the current feature to the existing segment
+            strand=line.split()[9]
+            add_feature(segment_id,feature_id,strand)
+    file.close()
+    # functions to get the gff of the graph from the segments and features generated with the intersect file
+# get the feature's start position on the segment
+def get_start(seg,feat):
+    s=Segments[seg]
+    f=Features[feat]
+    if s.start>f.start:
+        result=1
+    else:
+        result=f.start-s.start
+    return result
+# get the feature's stop position on the segment
+def get_stop(seg,feat):
+    s=Segments[seg]
+    f=Features[feat]    
+    if s.stop<f.stop:
+        result=s.size
+    else:
+        result=f.stop-s.start
+    return result
+# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
+def graph_gff(file_out):
+    print("generation of the graph's gff")
+    file_out = open(file_out, 'w')
+    for segment in Segments:
+        # get the list of the features on the segment
+        features_seg=Segments[segment].features
+        # go through all these features, and print the gff line for each
+        for feature_stranded in features_seg:
+            strand=feature_stranded[0:1]
+            feature=feature_stranded[1:]
+            segment_stranded=strand+segment
+            type=Features[feature].type
+            start=max(1,get_start(segment,feature))
+            stop=get_stop(segment,feature)
+            # get the rank and the total number of ocurrences for the feature
+            rank=str(Features[feature].segments_list.index(segment_stranded)+1)
+            total=str(len(Features[feature].segments_list))
+            # create the annotation with the rank information
+            annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total
+            # generate the gff line
+            ligne=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
+            #print(ligne)
+            file_out.write(ligne)
+    file_out.close()
+    # get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
+def get_start_2(seg_start, seg_ac, feat_id):
+    s_start=seg_ac[seg_start][1]
+    f_start=get_start(seg_start,feat_id)
+    start=int(s_start)+int(f_start)
+    return start
+# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
+def get_stop_2(seg_stop, seg_ac, feat_id):
+    s_start=seg_ac[seg_stop][1]
+    f_stop=get_stop(seg_stop,feat_id)
+    stop=int(s_start)+int(f_stop)
+    return stop
+# get the position of a piece of a feature on the complete feature (on the original genome)
+def get_position(seg_start,seg_stop,gene_start,feature,seg_ac):
+    # start position of the complete gene on the reference
+    start_gene_start=Segments[gene_start].start+get_start(gene_start,feature.id)-1
+    # start position of the piece of the feature on the reference
+    start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
+    # start and stop position of the feature on azucena, to get the length of the piece of the feature
+    start_azu=get_start_2(seg_start,seg_ac,feature.id) 
+    stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
+    # start position of the piece of the feature on the complete feature
+    start_gene=int(start_first_seg)-int(start_gene_start)+1
+    # stop position : start+length
+    stop_gene=start_gene+(stop_azu-start_azu)
+    position=";position="+str(start_gene)+"-"+str(stop_gene)
+    #position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
+    return position
+# get the proportion of the piece of the feature on the total length
+def get_proportion(seg_start,seg_stop,seg_ac,feature):
+    start_azu=get_start_2(seg_start,seg_ac,feature.id) # start position of the feature on azucena
+    stop_azu=get_stop_2(seg_stop,seg_ac,feature.id) # stop position of the feature on azucena
+    proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size)
+    #proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size)
+    return proportion
+# returns the gff line to write in the output file
+def write_line(list_seg,i,feat,seg_start,gene_start,seg_ac):
+    seg_stop=list_seg[i-1][1:]
+    feature=Features[feat]
+    chr=seg_ac[seg_start][0]
+    strand=list_seg[i-1][0:1]
+    # start and stop position of the feature on azucena
+    start_azu=get_start_2(seg_start,seg_ac,feature.id) 
+    stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
+    proportion=get_proportion(seg_start,seg_stop,seg_ac,feature)
+    position=get_position(seg_start,seg_stop,gene_start,feature,seg_ac)
+    annotation=feature.annot+proportion+position
+    out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
+    return out_line
+def azu(pos_seg, gff, out):
+    print("generation of azucena's gff ")
+    # create a dictionnary with the positions of the segments on azucena
+    seg_ac={}
+    bed=open(pos_seg,'r')
+    lines=bed.readlines()
+    for line in lines:
+        line=line.split()
+        s_id=line[3]
+        ch=line[0]
+        start=line[1]
+        stop=line[2]
+        seg_ac[s_id]=list([ch,start,stop])
+    bed.close()
+    gff=open(gff,'r')
+    file_out = open(out, 'w')
+    lines=gff.readlines()
+    # get the list of all the features to transfer
+    list_feature=list()
+    for line in lines:
+        id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
+        if id not in list_feature:
+            list_feature.append(id)
+    # stats on how many segments are absent in azucena, their average length, etc
+    stats=False
+    if stats==True:
+        seg_first=list()
+        seg_middle=list()
+        seg_last=list()
+        seg_total=list()
+        seg_ok=list()
+        seg_entier=list()
+        feature_missing_first=list()
+        feature_missing_middle=list()
+        feature_missing_last=list()
+        feature_missing_all=list()
+        feature_missing_total=list()
+    # for each feature, get list of the segments where it is
+    for feat in list_feature:
+        list_seg=Features[feat].segments_list
+        size_list=len(list_seg)
+        # stats on how many segments are absent in azucena, their average length, etc
+        if stats==True:
+            for segment in list_seg:
+                seg_len=Segments[segment[1:]].size
+                if segment[1:] not in seg_ac:
+                    seg_total.append(seg_len)
+                    if feat not in feature_missing_total:
+                        feature_missing_total.append(feat)
+                    if segment==list_seg[0]:
+                        if segment==list_seg[size_list-1]:
+                            seg_entier.append(seg_len)
+                            if feat not in feature_missing_all:
+                                feature_missing_all.append(feat)
+                        else: 
+                            seg_first.append(seg_len)
+                            if feat not in feature_missing_first:
+                                feature_missing_first.append(feat)
+                    elif segment==list_seg[size_list-1]:
+                        seg_last.append(seg_len)
+                        if feat not in feature_missing_last:
+                            feature_missing_last.append(feat)
+                    else:
+                        seg_middle.append(seg_len)
+                        if feat not in feature_missing_middle:
+                            feature_missing_middle.append(feat)
+                else: 
+                    seg_ok.append(seg_len)
+        gene_start="empty"
+        seg_start="empty"
+        seg_stop="empty"
+        # loop that goes through all the segments that have the current feature
+        # keeps the first segment present in azucena found, and when it finds a segment absent in azucena, prints the piece of the fragment, and resets the first segment present.
+        # at the end of the loop, prints the last piece of the fragment.
+        for i in range(0,size_list):
+            if list_seg[i][1:] in seg_ac:
+                if gene_start=="empty":
+                    gene_start=list_seg[i][1:]
+                if seg_start=="empty":
+                    seg_start=list_seg[i][1:]
+                #else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
+            else:
+                if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
+                    out_line=write_line(list_seg,i,feat,seg_start,gene_start,seg_ac)
+                    file_out.write(out_line)
+                    seg_start="empty"
+                    seg_stop="empty"
+                #else: if the current segment is not in azucena but there is no start, keep looking for a start
+        # print the last piece of the feature in the end
+        if list_seg[i][1:] in seg_ac:
+            out_line=write_line(list_seg,i+1,feat,seg_start,gene_start,seg_ac)
+            file_out.write(out_line)
+    file_out.close()
+    gff.close() 
+    # print stats
+    from statistics import median, mean
+    if stats==True:
+        print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
+        print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
+        multiple_3=0
+        for seg in seg_middle:
+            if seg%3==0:multiple_3+=1
+        print(multiple_3,"segments missing in the middle of a feature, of length=3k")
+        print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
+        print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
+        print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
+        print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
+        print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(117+5)/153,2),"% hypothetical or putative.")
+        print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(180+3)/263,2),"% hypothetical or putative.")
+        print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(115+7)/156,2),"% hypothetical or putative.")
+        print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(314+24)/404,2),"% hypothetical or putative.")
+        print("there is at least one segment missing", len(feature_missing_total) ,"features, including",round(100*(606+36)/817,2),"% hypothetical or putative.")
+        print("there is", len(list_feature) ,"features in total, including",round(100*(2108+171)/3510,2),"% hypothetical or putative.")
+        feat_miss_first=open('features_missing_first.txt','w')
+        feat_miss_first.write("features missing the first segment on azucena :\n")
+        for first in feature_missing_first :
+            feat_miss_first.write(Features[first].annot)
+            feat_miss_first.write("\n")
+        feat_miss_first.close()
+        feat_miss_middle=open('features_missing_middle.txt','w')
+        feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
+        for middle in feature_missing_middle :
+            feat_miss_middle.write(Features[middle].annot)
+            feat_miss_middle.write("\n")
+        feat_miss_middle.close()
+        feat_miss_last=open('features_missing_last.txt','w')
+        feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
+        for last in feature_missing_last :
+            feat_miss_last.write(Features[last].annot)
+            feat_miss_last.write("\n")
+        feat_miss_last.close()
+        feat_miss=open('features_missing.txt','w')
+        feat_miss.write("\nfeatures missing entirely on azucena :\n")
+        for entier in feature_missing_all :
+            feat_miss.write(Features[entier].annot)
+            feat_miss.write("\n")
+        feat_miss.close()
+        feat_miss_total=open('features_missing_total.txt','w')
+        feat_miss_total.write("\nfeatures missing a segment on azucena :\n")
+        for total in feature_missing_total :
+            feat_miss_total.write(Features[total].annot)
+            feat_miss_total.write("\n")
+        feat_miss_total.close()
+        feat_all=open('all_features.txt','w')
+        for feat in list_feature:
+            feat_all.write(Features[feat].annot)
+            feat_all.write("\n")
+        feat_all.close()
\ No newline at end of file
--- a/main.py
+++ b/main.py
+import Functions as fct
+intersect_path='../wd/data/intersect_all_chr10.bed'
+fct.create_seg_feat(intersect_path)
+output_file='graphe_chr10_all.gff'
+fct.graph_gff(output_file)
+pos_seg="pos_seg_azu.bed"
+gff="graphe_chr10_all.gff"
+out="azucena_chr10_all.gff"
+fct.azu(pos_seg,gff,out)