Skip to content
Snippets Groups Projects
Commit c8b1d5c1 authored by NMarthe's avatar NMarthe
Browse files

scripts python pour générer les gff du graphe et des génomes, pour l'ancienne...

scripts python pour générer les gff du graphe et des génomes, pour l'ancienne version du graphe (minigraph), issus du jupyternotebook
parent a750b3ed
Branches
No related tags found
No related merge requests found
class Segment:
def __init__(self,id,feature,chr,start,stop):
self.id=id
self.features=feature # list of the features on this segment. if thre feature is on the + strand, it will be named '+feature_id', else '-feature_id'
# position on the reference genome
self.chr=chr
self.start=int(start)
self.stop=int(stop)
self.size=self.stop-self.start+1
def __str__(self):
return f"id={self.id}, position sur nipponbare={self.chr}:{self.start}-{self.stop}, size={self.size}, features={self.features}"
class Feature:
def __init__(self,id,type,chr,start,stop,annot,childs,parent,seg):
self.id=id
self.type=type
# position on the reference genome
self.chr=chr
self.start=int(start)
self.stop=int(stop)
self.size=int(stop)-int(start)+1
self.annot=annot
self.childs=childs # liste of childs features (exons, cds, etc)
self.parent=parent
self.segments_list=seg # list of oriented segment on which the feature is (-s1/+s1, depending on the strand)
def __str__(self):
if self.parent=="":
return f"id={self.id}, type={self.type}, segments={self.segments_list}, position sur la référence={self.chr}:{self.start}-{self.stop}, childs={self.childs}, annotation={self.annot}"
else:
return f"id={self.id}, type={self.type}, segments={self.segments_list}, position sur la référence={self.chr}:{self.start}-{self.stop}, parent={self.parent}, childs={self.childs}, annotation={self.annot}"
\ No newline at end of file
import ClassSegFeat as Class
# functions to create the segments and the features
# create segment
def init_seg(line,feature):
line=line.split()
seg_id=line[3][1:]
chr=line[0]
start=line[1]
stop=line[2]
# add the current feature (stranded) to the list of features that are on the segment
feature_strand=line[9]
feature_stranded=feature_strand+feature
feat=list()
feat.append(feature_stranded)
Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)
# create feature
def init_feature(line):
line=line.split()
feature_id=line[11].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
type=line[5]
annot=line[11]
chr=line[4]
start=line[6]
stop=line[7]
childs=list()
# add the current segment (stranded) to the list of segments that have the feature
seg=line[3][1:]
strand=line[9]
seg_stranded=strand+seg
segments_list=list()
segments_list.append(seg_stranded)
# if the current feature as a parent, add the feature un the childs list of its parent
if annot.split(";")[1].split("=")[0]=="Parent":
# for annotations like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
elif annot.split(";")[2].split("=")[0]=="Parent":
# for annotations like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
else: parent=""
Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
# add a feature (stranded) to an existing segment
def add_feature(seg,new_feature,strand):
new_feature_stranded=strand+new_feature
if new_feature_stranded not in Segments[seg].features:
Segments[seg].features.append(new_feature_stranded)
# add a child to an existing feature
def add_child(feat,new_child):
if feat in Features.keys():
if new_child not in Features[feat].childs:
Features[feat].childs.append(new_child)
# add a segment (stranded) to an existing feature
def add_seg(feat,new_seg,strand):
seg_stranded=strand+new_seg
if seg_stranded not in Features[feat].segments_list:
Features[feat].segments_list.append(seg_stranded)
# create all the Segment and Feature objects in the dictionnaries Segments and Features
def create_seg_feat(intersect_path):
global Features
global Segments
Segments={}
Features={}
# open the intersect between the segments and the gff
file = open(intersect_path, 'r')
lines=file.readlines()
for line in lines:
# get the ids for the dictionnaries' keys
feature_id=line.split()[11].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
segment_id=line.split()[3][1:]
if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
init_feature(line)
else: # add the current segment to the list of segments that have the existing feature
strand=line.split()[9]
add_seg(feature_id,segment_id,strand)
if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
init_seg(line, feature_id)
else: # add the current feature to the existing segment
strand=line.split()[9]
add_feature(segment_id,feature_id,strand)
file.close()
# functions to get the gff of the graph from the segments and features generated with the intersect file
# get the feature's start position on the segment
def get_start(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.start>f.start:
result=1
else:
result=f.start-s.start
return result
# get the feature's stop position on the segment
def get_stop(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.stop<f.stop:
result=s.size
else:
result=f.stop-s.start
return result
# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
print("generation of the graph's gff")
file_out = open(file_out, 'w')
for segment in Segments:
# get the list of the features on the segment
features_seg=Segments[segment].features
# go through all these features, and print the gff line for each
for feature_stranded in features_seg:
strand=feature_stranded[0:1]
feature=feature_stranded[1:]
segment_stranded=strand+segment
type=Features[feature].type
start=max(1,get_start(segment,feature))
stop=get_stop(segment,feature)
# get the rank and the total number of ocurrences for the feature
rank=str(Features[feature].segments_list.index(segment_stranded)+1)
total=str(len(Features[feature].segments_list))
# create the annotation with the rank information
annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total
# generate the gff line
ligne=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
#print(ligne)
file_out.write(ligne)
file_out.close()
# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_start_2(seg_start, seg_ac, feat_id):
s_start=seg_ac[seg_start][1]
f_start=get_start(seg_start,feat_id)
start=int(s_start)+int(f_start)
return start
# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_stop_2(seg_stop, seg_ac, feat_id):
s_start=seg_ac[seg_stop][1]
f_stop=get_stop(seg_stop,feat_id)
stop=int(s_start)+int(f_stop)
return stop
# get the position of a piece of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,gene_start,feature,seg_ac):
# start position of the complete gene on the reference
start_gene_start=Segments[gene_start].start+get_start(gene_start,feature.id)-1
# start position of the piece of the feature on the reference
start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
# start and stop position of the feature on azucena, to get the length of the piece of the feature
start_azu=get_start_2(seg_start,seg_ac,feature.id)
stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
# start position of the piece of the feature on the complete feature
start_gene=int(start_first_seg)-int(start_gene_start)+1
# stop position : start+length
stop_gene=start_gene+(stop_azu-start_azu)
position=";position="+str(start_gene)+"-"+str(stop_gene)
#position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
return position
# get the proportion of the piece of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_ac,feature):
start_azu=get_start_2(seg_start,seg_ac,feature.id) # start position of the feature on azucena
stop_azu=get_stop_2(seg_stop,seg_ac,feature.id) # stop position of the feature on azucena
proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size)
#proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size)
return proportion
# returns the gff line to write in the output file
def write_line(list_seg,i,feat,seg_start,gene_start,seg_ac):
seg_stop=list_seg[i-1][1:]
feature=Features[feat]
chr=seg_ac[seg_start][0]
strand=list_seg[i-1][0:1]
# start and stop position of the feature on azucena
start_azu=get_start_2(seg_start,seg_ac,feature.id)
stop_azu=get_stop_2(seg_stop,seg_ac,feature.id)
proportion=get_proportion(seg_start,seg_stop,seg_ac,feature)
position=get_position(seg_start,seg_stop,gene_start,feature,seg_ac)
annotation=feature.annot+proportion+position
out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
return out_line
def azu(pos_seg, gff, out):
print("generation of azucena's gff ")
# create a dictionnary with the positions of the segments on azucena
seg_ac={}
bed=open(pos_seg,'r')
lines=bed.readlines()
for line in lines:
line=line.split()
s_id=line[3]
ch=line[0]
start=line[1]
stop=line[2]
seg_ac[s_id]=list([ch,start,stop])
bed.close()
gff=open(gff,'r')
file_out = open(out, 'w')
lines=gff.readlines()
# get the list of all the features to transfer
list_feature=list()
for line in lines:
id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
if id not in list_feature:
list_feature.append(id)
# stats on how many segments are absent in azucena, their average length, etc
stats=False
if stats==True:
seg_first=list()
seg_middle=list()
seg_last=list()
seg_total=list()
seg_ok=list()
seg_entier=list()
feature_missing_first=list()
feature_missing_middle=list()
feature_missing_last=list()
feature_missing_all=list()
feature_missing_total=list()
# for each feature, get list of the segments where it is
for feat in list_feature:
list_seg=Features[feat].segments_list
size_list=len(list_seg)
# stats on how many segments are absent in azucena, their average length, etc
if stats==True:
for segment in list_seg:
seg_len=Segments[segment[1:]].size
if segment[1:] not in seg_ac:
seg_total.append(seg_len)
if feat not in feature_missing_total:
feature_missing_total.append(feat)
if segment==list_seg[0]:
if segment==list_seg[size_list-1]:
seg_entier.append(seg_len)
if feat not in feature_missing_all:
feature_missing_all.append(feat)
else:
seg_first.append(seg_len)
if feat not in feature_missing_first:
feature_missing_first.append(feat)
elif segment==list_seg[size_list-1]:
seg_last.append(seg_len)
if feat not in feature_missing_last:
feature_missing_last.append(feat)
else:
seg_middle.append(seg_len)
if feat not in feature_missing_middle:
feature_missing_middle.append(feat)
else:
seg_ok.append(seg_len)
gene_start="empty"
seg_start="empty"
seg_stop="empty"
# loop that goes through all the segments that have the current feature
# keeps the first segment present in azucena found, and when it finds a segment absent in azucena, prints the piece of the fragment, and resets the first segment present.
# at the end of the loop, prints the last piece of the fragment.
for i in range(0,size_list):
if list_seg[i][1:] in seg_ac:
if gene_start=="empty":
gene_start=list_seg[i][1:]
if seg_start=="empty":
seg_start=list_seg[i][1:]
#else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
else:
if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
out_line=write_line(list_seg,i,feat,seg_start,gene_start,seg_ac)
file_out.write(out_line)
seg_start="empty"
seg_stop="empty"
#else: if the current segment is not in azucena but there is no start, keep looking for a start
# print the last piece of the feature in the end
if list_seg[i][1:] in seg_ac:
out_line=write_line(list_seg,i+1,feat,seg_start,gene_start,seg_ac)
file_out.write(out_line)
file_out.close()
gff.close()
# print stats
from statistics import median, mean
if stats==True:
print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
multiple_3=0
for seg in seg_middle:
if seg%3==0:multiple_3+=1
print(multiple_3,"segments missing in the middle of a feature, of length=3k")
print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(117+5)/153,2),"% hypothetical or putative.")
print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(180+3)/263,2),"% hypothetical or putative.")
print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(115+7)/156,2),"% hypothetical or putative.")
print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(314+24)/404,2),"% hypothetical or putative.")
print("there is at least one segment missing", len(feature_missing_total) ,"features, including",round(100*(606+36)/817,2),"% hypothetical or putative.")
print("there is", len(list_feature) ,"features in total, including",round(100*(2108+171)/3510,2),"% hypothetical or putative.")
feat_miss_first=open('features_missing_first.txt','w')
feat_miss_first.write("features missing the first segment on azucena :\n")
for first in feature_missing_first :
feat_miss_first.write(Features[first].annot)
feat_miss_first.write("\n")
feat_miss_first.close()
feat_miss_middle=open('features_missing_middle.txt','w')
feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
for middle in feature_missing_middle :
feat_miss_middle.write(Features[middle].annot)
feat_miss_middle.write("\n")
feat_miss_middle.close()
feat_miss_last=open('features_missing_last.txt','w')
feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
for last in feature_missing_last :
feat_miss_last.write(Features[last].annot)
feat_miss_last.write("\n")
feat_miss_last.close()
feat_miss=open('features_missing.txt','w')
feat_miss.write("\nfeatures missing entirely on azucena :\n")
for entier in feature_missing_all :
feat_miss.write(Features[entier].annot)
feat_miss.write("\n")
feat_miss.close()
feat_miss_total=open('features_missing_total.txt','w')
feat_miss_total.write("\nfeatures missing a segment on azucena :\n")
for total in feature_missing_total :
feat_miss_total.write(Features[total].annot)
feat_miss_total.write("\n")
feat_miss_total.close()
feat_all=open('all_features.txt','w')
for feat in list_feature:
feat_all.write(Features[feat].annot)
feat_all.write("\n")
feat_all.close()
\ No newline at end of file
main.py 0 → 100644
import Functions as fct
intersect_path='../wd/data/intersect_all_chr10.bed'
fct.create_seg_feat(intersect_path)
output_file='graphe_chr10_all.gff'
fct.graph_gff(output_file)
pos_seg="pos_seg_azu.bed"
gff="graphe_chr10_all.gff"
out="azucena_chr10_all.gff"
fct.azu(pos_seg,gff,out)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment