commencé à réorganiser le code plus proprement.changé l'output du détail des...

commencé à réorganiser le code plus proprement.changé l'output du détail des variations, corrigé quelques erreurs de calculs de coordonnées

commencé à réorganiser le code plus proprement.changé l'output du détail des...
2bb28bd9 · NMarthe · 3b429c92 · 2bb28bd9 · 2bb28bd9 · 2bb28bd9
Commit 2bb28bd9 authored 1 year ago by NMarthe
--- a/ClassSegFeat.py
+++ b/ClassSegFeat.py
 class Segment:
    def __init__(self,id,feature,chr,start,stop):
        self.id=id
-        self.features=feature # list of the features on this segment. if thre feature is on the + strand, it will be named '+feature_id', else '-feature_id'
+        self.features=feature # list of the features on this segment. if the feature is on the + strand, it will be named '+feature_id', else '-feature_id'

        # position on the original genome
        self.chr=chr
@@ -31,13 +31,13 @@ class Feature:
        
        self.size=int(stop)-int(start)+1

-        self.pos_start=0 # position sur le premier segment
-        self.pos_stop=0 # position sur le dernier segment
+        self.pos_start=0 # position on the first segment
+        self.pos_stop=0 # position on the last segment
        
        self.annot=annot
-        self.childs=childs # liste of childs features (exons, cds, etc)
+        self.childs=childs # liste of child features (exons, cds, etc)
        self.parent=parent
-        self.segments_list=seg # list of oriented segment on which the feature is (-s1/+s1, depending on the strand)
+        self.segments_list=seg # list of oriented segments on which the feature is (-s1/+s1, depending on the strand)

        self.note=""


--- a/Functions.py
+++ b/Functions.py
--- a/Graph_gff.py
+++ b/Graph_gff.py
+import ClassSegFeat as Class
+
+# create global variables to manipulate them in the functions below and to pass them to Functions.py
+global Features
+global Segments
+Segments={}
+Features={}
+
+
+
+# functions to create the segments and the features
+
+# create a segment with its first feature
+def init_seg(line,feature):
+    line=line.split()
+    seg_id=line[3][1:]
+    chr=line[0]
+    start=line[1]
+    stop=line[2]
+    
+    # add the current feature to the list of features that are on the segment
+    feature_strand=line[10]
+    feature_stranded=feature_strand+feature
+    feat=list()
+    feat.append(feature_stranded)
+    
+    # create the segment
+    Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)
+
+
+# create a feature with the first segment its on
+def init_feature(line):
+    line=line.split()
+    feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
+    type=line[6]
+    annot=line[12]
+    chr=line[4]
+    start=line[7]
+    stop=line[8]
+    childs=list()
+    
+    # add the current segment to the list of segments that have the feature
+    seg=line[3][1:]
+    strand=line[10]
+    seg_stranded=strand+seg
+    segments_list=list()
+    segments_list.append(seg_stranded)
+
+    parent=get_parent(annot,feature_id)
+
+    # create the feature
+    Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
+
+
+# if there is a parent, get the id of the parent feature and add the child to the parent
+def get_parent(annot,feature_id):
+    # if the current feature has a parent, add the current feature in the childs list of its parent
+    if annot.split(";")[1].split("=")[0]=="Parent":
+    # for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
+        parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
+        add_child(parent,feature_id)
+    
+    elif annot.split(";")[2].split("=")[0]=="Parent":
+    # for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
+        parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
+        add_child(parent,feature_id)
+    else: parent=""
+
+    return parent
+
+
+# add a feature to an existing segment
+def add_feature(seg,new_feature,strand):
+    new_feature_stranded=strand+new_feature
+    if new_feature_stranded not in Segments[seg].features:
+        Segments[seg].features.append(new_feature_stranded)
+    
+
+# add a child feature to an existing feature
+def add_child(feat,new_child):
+    if feat in Features.keys(): # if the parent feature exists
+        if new_child not in Features[feat].childs:
+            Features[feat].childs.append(new_child)
+    
+    
+# add a segment to an existing feature
+def add_seg(feat,new_seg,strand):
+    seg_stranded=strand+new_seg
+    if seg_stranded not in Features[feat].segments_list:
+        Features[feat].segments_list.append(seg_stranded)
+
+
+# create a note for the child features that do not have annotation. 
+def set_note(id):
+    # the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
+    # in the gff, the notes are only on the "gene" features. it's easier to have it for the childs than to check the parent's note (or the parent's parent). 
+    feat=Features[id]
+    if feat.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
+        feat.note=feat.annot.split(';')[-1]
+    else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
+        # we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
+        # this is because for example the parent of an exon is the mrna, not the gene itself, so we need to go up until we find the gene.
+        curent=feat.parent
+        annot_found=False
+        while annot_found==False:
+            if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
+                note=Features[curent].annot.split(';')[-1]
+                feat.note=note
+                annot_found=True
+            else: # if we didn't find the gene, we go up to the current feature's parent until we find it
+                curent=Features[Features[curent].parent].id
+
+
+# create all the Segment and Feature objects in the dictionnaries Segments and Features
+def create_seg_feat(intersect_path):    
+    
+    # open the file with the intersect between the segments and the gff
+    file = open(intersect_path, 'r')
+    lines=file.readlines()
+    file.close()
+    
+    for line in lines: # for each line in the intersect file
+        # get the ids for the dictionnaries' keys
+        feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
+        segment_id=line.split()[3][1:]
+    
+        if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
+            init_feature(line)
+        else: # if it exists, add the current segment to the list of segments that have the existing feature
+            strand=line.split()[10]
+            add_seg(feature_id,segment_id,strand)
+    
+        if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
+            init_seg(line, feature_id)
+        
+        else: # if it exists, add the current feature to the list of features on the existing segment
+            strand=line.split()[10]
+            add_feature(segment_id,feature_id,strand)
+
+    # for all the features, add the note (information on the function of the feature), and the positions on the first and last seg.
+    # cant always do it before because for that i need to have all the parents in the dict Features, and all the segments in the list segments_list for each feature.
+    for feat_id in Features:
+        feat=Features[feat_id]
+        set_note(feat.id)
+        feat.pos_start=get_start(feat.segments_list[0][1:],feat.id)
+        feat.pos_stop=get_stop(feat.segments_list[-1][1:],feat.id)
+
+
+
+
+
+
+# functions to generate the graph's gff from the segments and features created with create_seg_feat
+
+# get the feature's start position on the segment
+def get_start(seg_id,feat_id):
+    s=Segments[seg_id]
+    f=Features[feat_id]
+    if s.start>=f.start:
+        result=1
+    else:
+        result=f.start-s.start+1
+    return result
+
+
+# get the feature's stop position on the segment
+def get_stop(seg_id,feat_id):
+    s=Segments[seg_id]
+    f=Features[feat_id]
+    if s.stop<=f.stop:
+        result=s.size
+    else:
+        result=f.stop-s.start+1
+    return result
+
+
+# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
+def graph_gff(file_out):
+    print("generation of the graph's gff")
+    file_out = open(file_out, 'w')
+    string_out=""
+    cpt_out=0
+
+    for segment in Segments:
+        # get the list of the features on the segment
+        features_seg=Segments[segment].features
+        
+        # go through all the segment's features, and print the gff line for each
+        for feature_stranded in features_seg:
+            strand=feature_stranded[0:1]
+            feature=feature_stranded[1:]
+            # feature[feature]
+
+            #[strand,feature]=[feature_stranded[0:1],feature_stranded[1:]]
+            segment_stranded=strand+segment
+            type=Features[feature].type
+            start=get_start(segment,feature)
+            stop=get_stop(segment,feature)
+            
+            # get the rank and the total number of ocurrences for the feature
+            rank=str(Features[feature].segments_list.index(segment_stranded)+1)
+            total=str(len(Features[feature].segments_list))
+
+            # create the annotation with the rank information
+            annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total
+
+            # write the gff line in the output file
+            line=segment+"\tGrAnnot\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
+            string_out+=line
+            cpt_out+=1
+            #file_out.write(line)
+
+            if cpt_out==1000:
+                file_out.write(string_out)
+                string_out=""
+                cpt_out=0
+                # faire un fct print
+    file_out.write(string_out)             
+
+    file_out.close()
+
+
+
+
+
+
--- a/main.py
+++ b/main.py
-import Functions as fct
+#!/usr/bin/python
+
+# created by Nina Marthe 2023
+# licensed under MIT
+
+from Functions import *
+from Graph_gff import *

 # creates segments and features for the intersect between the graph for chr10 and the gff of IRGSP
 intersect_path='/home/nina/annotpangenome/align_genes/intersect_segments-genes_chr10.bed'
-#intersect_path='/home/nina/annotpangenome/align_genes/intersect_reel_genes_chr10.bed'
-fct.create_seg_feat(intersect_path)
+create_seg_feat(intersect_path)

-# outputs the gff of the graph for chr10
-output_file='new_graph_chr10.gff'
-fct.graph_gff(output_file)


-gff="new_graph_chr10.gff"
+# outputs the gff of the graph for chr10
+output_file='graph_chr10.gff'
+graph_gff(output_file)

-genome = 'ac'

+genome = 'ac'
 if genome=='ac': # transfer from graph to azucena
    pos_seg="seg_coord/AzucenaRS1_chromosome10.bed"
-    #out="azucena_chr10_all.gff"
    out="azucena_chr10.gff"

 if genome=='nb': # transfer from graph to nipponbare
    out="nb_chr10_all.gff"
    pos_seg="seg_coord/IRGSP-1_0_Chr10.bed"

+gff="graph_chr10.gff"
+gfa="data/graphs/RiceGraphChr10_cactus.gfa"
+
 # outputs the gff of a genome for the chr10
-fct.genome_gff(pos_seg,gff,out)
+genome_gff(pos_seg,gff,out,gfa)