Newer
Older
import ClassSegFeat as Class
# functions to create the segments and the features
def init_seg(line,feature):
line=line.split()
seg_id=line[3][1:]
chr=line[0]
start=line[1]
stop=line[2]
# add the current feature to the list of features that are on the segment
feature_strand=line[10]
feature_stranded=feature_strand+feature
feat=list()
feat.append(feature_stranded)
Segments[seg_id]=Class.Segment(seg_id,feat,chr,start,stop)
def init_feature(line):
line=line.split()
feature_id=line[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
type=line[6]
annot=line[12]
chr=line[4]
start=line[7]
stop=line[8]
childs=list()
# add the current segment to the list of segments that have the feature
seg=line[3][1:]
strand=line[10]
seg_stranded=strand+seg
segments_list=list()
segments_list.append(seg_stranded)
# if the current feature has a parent, add the current feature in the childs list of its parent
if annot.split(";")[1].split("=")[0]=="Parent":
# for annotations that look like : ID=LOC_Os01g01010.1:exon_7;Parent=LOC_Os01g01010.1, where the parent is in the field 1
parent=annot.split(";")[1].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
elif annot.split(";")[2].split("=")[0]=="Parent":
# for annotations that look like : ID=LOC_Os01g01010.1;Name=LOC_Os01g01010.1;Parent=LOC_Os01g01010, where the parent is in the field 2
parent=annot.split(";")[2].split("=")[1].replace(".","_").replace(":","_")
add_child(parent,feature_id)
else: parent=""
Features[feature_id]=Class.Feature(feature_id,type,chr,start,stop,annot,childs,parent,segments_list)
def add_feature(seg,new_feature,strand):
new_feature_stranded=strand+new_feature
if new_feature_stranded not in Segments[seg].features:
Segments[seg].features.append(new_feature_stranded)
def add_child(feat,new_child):
if feat in Features.keys(): # if the parent feature exists
if new_child not in Features[feat].childs:
Features[feat].childs.append(new_child)
def add_seg(feat,new_seg,strand):
seg_stranded=strand+new_seg
if seg_stranded not in Features[feat].segments_list:
Features[feat].segments_list.append(seg_stranded)
# create a note for the child features that do not have annotation.
# the note contains information on the function of the feature and is used for statistics on hypothetical/putatives features.
if feat.type=="gene": # if the feature is a gene, the note is the last field of its annotation.
feat.note=feat.annot.split(';')[-1]
else: # else, the note will be the note of the gene that contains the feature. in my gff, only the genes have an annotation.
# we go back to the parent of the feature, and its parent if necessary, etc, until we find the gene.
annot_found=False
while annot_found==False:
if Features[curent].type=="gene": # if/once we found the gene, we get its note to transfer it to the child feature
note=Features[curent].annot.split(';')[-1]
feat.note=note
annot_found=True
else: # if we didn't find the gene, we go back to the current feature's parent until we find it
curent=Features[Features[curent].parent].id
# create all the Segment and Feature objects in the dictionnaries Segments and Features
def create_seg_feat(intersect_path):
global Features
global Segments
Segments={}
Features={}
# open the file with the intersect between the segments and the gff
file = open(intersect_path, 'r')
lines=file.readlines()
for line in lines:
# get the ids for the dictionnaries' keys
feature_id=line.split()[12].split(';')[0].split("=")[1].replace(".","_").replace(":","_")
segment_id=line.split()[3][1:]
if feature_id not in Features: # if the feature doesn't exist, create it and add the current segment to its seg list
init_feature(line)
else: # if it exists, add the current segment to the list of segments that have the existing feature
strand=line.split()[10]
add_seg(feature_id,segment_id,strand)
if segment_id not in Segments: # if the segment doesn't exist, create it and add the current feature to its feat list
init_seg(line, feature_id)
else: # if it exists, add the current feature to the list of features on the existing segment
strand=line.split()[10]
add_feature(segment_id,feature_id,strand)
# for all the features, add the note (information on the function of the feature)
for feat_id in Features:
set_note(feat_id)
file.close()
# functions to generate the graph's gff from the segments and features created with create_seg_feat
# get the feature's start position on the segment
def get_start(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.start>f.start:
result=1
else:
result=f.start-s.start
return result
# get the feature's stop position on the segment
def get_stop(seg,feat):
s=Segments[seg]
f=Features[feat]
if s.stop<f.stop:
result=s.size
else:
result=f.stop-s.start
return result
# go through all the segments in Segments and prints the gff, with one line for each segment/feature intersection
def graph_gff(file_out):
print("generation of the graph's gff")
file_out = open(file_out, 'w')
for segment in Segments:
# get the list of the features on the segment
features_seg=Segments[segment].features
# go through all these features, and print the gff line for each
for feature_stranded in features_seg:
strand=feature_stranded[0:1]
feature=feature_stranded[1:]
segment_stranded=strand+segment
type=Features[feature].type
stop=get_stop(segment,feature)
# get the rank and the total number of ocurrences for the feature
rank=str(Features[feature].segments_list.index(segment_stranded)+1)
total=str(len(Features[feature].segments_list))
# create the annotation with the rank information
annotation=Features[feature].annot+";Rank_occurrence="+rank+";Total_occurrences="+total
# write the gff line in the output file
line=segment+"\tgraph_gff\t"+type+"\t"+str(start)+"\t"+str(stop)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
file_out.write(line)
file_out.close()
# functions to generate a genome's gff from the graph's gff
# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_start_2(seg_start, seg_genome, feat_id):
s_start=seg_genome[seg_start][1]
f_start=get_start(seg_start,feat_id)
start=int(s_start)+int(f_start)
return start
# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
def get_stop_2(seg_stop, seg_genome, feat_id):
s_start=seg_genome[seg_stop][1]
f_stop=get_stop(seg_stop,feat_id)
stop=int(s_start)+int(f_stop)
return stop
# get the position of a part of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,feat_start,feature,seg_genome):
# start position of the entire feature on the reference
start_gene_start=Segments[feat_start].start+get_start(feat_start,feature.id)-1
# start position of the piece of the feature on the reference
start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
# start and stop position of the feature on azucena, to get the length of the piece of the feature
start_azu=get_start_2(seg_start,seg_genome,feature.id)
stop_azu=get_stop_2(seg_stop,seg_genome,feature.id)
# start position of the piece of the feature on the complete feature
start_gene=int(start_first_seg)-int(start_gene_start)+1
# stop position : start+length
stop_gene=start_gene+(stop_azu-start_azu)
position=";position="+str(start_gene)+"-"+str(stop_gene)
#position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
return position
# get the proportion of a part of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_genome,feature):
start_azu=get_start_2(seg_start,seg_genome,feature.id) # start position of the feature on azucena
stop_azu=get_stop_2(seg_stop,seg_genome,feature.id) # stop position of the feature on azucena
proportion=";proportion="+str(stop_azu-start_azu+1)+"/"+str(feature.size)
#proportion=";number_bases="+str(stop_azu-start_azu+1)+";total_bases="+str(feature.size)
return proportion
# returns the gff line to write in the output file
def write_line(list_seg,i,feat,seg_start,feat_start,seg_genome):
seg_stop=list_seg[i-1][1:]
feature=Features[feat]
strand=list_seg[i-1][0:1]
# start and stop position of the feature on azucena
start_azu=get_start_2(seg_start,seg_genome,feature.id)
stop_azu=get_stop_2(seg_stop,seg_genome,feature.id)
proportion=get_proportion(seg_start,seg_stop,seg_genome,feature)
position=get_position(seg_start,seg_stop,feat_start,feature,seg_genome)
annotation=feature.annot+proportion+position
out_line=chr+"\tgraph_gff\t"+feature.type+"\t"+str(start_azu)+"\t"+str(stop_azu)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
return out_line
# writes the gff of azucena using the gff of the graph
# create a dictionnary with the positions of the segments on the genome to transfer on
seg_genome={}
bed=open(pos_seg,'r')
lines=bed.readlines()
for line in lines:
line=line.split()
s_id=line[3][1:]
ch=line[0]
start=line[1]
stop=line[2]
bed.close()
gff=open(gff,'r')
file_out = open(out, 'w')
file_out_alt=open("azucena_chr10_alt.gff",'w')
file_out2 = open("segments_manquants.txt", 'w')
lines=gff.readlines()
diff_list=list()
stats=True
# get the list of all the features to transfer from the gff
list_feature=list()
for line in lines:
id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
if id not in list_feature:
list_feature.append(id)
# create objects for stats on how many segments are absent in azucena, their average length, etc
if stats==True:
seg_first=list()
seg_middle=list()
seg_last=list()
seg_ok=list()
seg_entier=list()
feature_missing_first=list()
feature_missing_middle=list()
feature_missing_last=list()
feature_missing_all=list()
feature_missing_total=list()
feature_total=list()
feature_ok=list()
#size_seg_missing=list()
# for each feature, get list of the segments where it is and the first and last segment of the feature on the genome
for feat in list_feature:
list_seg=Features[feat].segments_list
size_list=len(list_seg)
first_seg="0"
for segment in list_seg:
first_seg=segment[1:]
break
last_seg="0"
for segment in reversed(list_seg):
last_seg=segment[1:]
break
# stats on how many segments are absent in azucena, their average length, etc
# for the segments
# segment missing that is at the start of a feature - seg_first
# segment missing that is at the end of a feature - seg_last
# segment missing that is in the middle of a feature - seg_middle
# segment missing that contains the entire feature - seg_entier
# total number of genes missing - seg_total_abs
# segments not missing - seg_ok
# segments missing of not - seg_total
if (stats==True):
for segment in list_seg: # counts the segments in each category
seg_len=Segments[segment[1:]].size
if segment[1:] not in seg_genome: # the segment is missing
seg_total.append(seg_len)
segments_missing.append(Segments[segment[1:]])
if segment==list_seg[0]: # the segment is the first of the feature
if segment==list_seg[-1]: # the segment is the last of the feature
seg_entier.append(seg_len) # the segment is the first and the last, so it contains the entire feature
else:
seg_first.append(seg_len)
elif segment==list_seg[-1]: # the segment is the last of the feature
seg_last.append(seg_len)
else:
seg_middle.append(seg_len)
seg_ok.append(seg_len)
# for the features :
# the fist segment of the feature is missing - feature_missing_first
# the last segment of the feature is missing - feature_missing_last
# at least one middle segment of the feature is missing - feature_missing_middle
# the entire feature is missing ->feature_missing_all
# at least one segment is missing first, last, or middle) - feature_missing_total
# no segment is missing, the feature is complete - feature_ok
# total number of features, with missing segments or not - feature_total
if (stats==True):
feature_total.append(feat)
if (first_seg=='0') & (last_seg=='0') : # no segment of the feature is in the genome, the feature is missing entirely
feature_missing_all.append(feat)
elif first_seg != list_seg[0][1:]: # the first segment is missing
feature_missing_first.append(feat)
elif last_seg!=list_seg[-1][1:]: # the last segment is missing
feature_missing_last.append(feat)
# go through all the segments, check if some are missing in the middle of the feature
elif (len(list_seg)!=1) & (feat not in feature_missing_all): # to access the second to last element
for segment in list_seg[1-(len(list_seg)-2)]:
feature_missing_middle.append(feat)
break
# go through the segments, to see if one is missing anywhere on the feature
for segment in list_seg:
if feat not in feature_missing_total:
feature_missing_total.append(feat)
break
# if the feature doesnt have a missing segment, it is complete. ADD THE PATH CHECK !!
if feat not in feature_missing_total:
feature_ok.append(feat)
# outputs each fragment of each feature, with its position on the new genome and its annotation :
# loop that goes through all the segments that have the current feature
# keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the part of the fragment, and resets the first segment present.
# continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature.
# etc.
# at the end of the loop, prints the last part of the fragment.
feat_start="empty" # stocks the first segment of the feature
seg_start="empty" # stocks the first segment of the current part of the feature
#seg_stop="empty" # stocks the last segment of the current part of the feature
for i in range(0,size_list):
if list_seg[i][1:] in seg_genome: # look for a segment present in the genome
if feat_start=="empty":
feat_start=list_seg[i][1:]
if seg_start=="empty": # if we dont have a start, take the current segment for the start of the part of the feature
seg_start=list_seg[i][1:]
#else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
else:
if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
out_line=write_line(list_seg,i,feat,seg_start,feat_start,seg_genome)
file_out.write(out_line)
seg_start="empty"
#else: if the current segment is not in azucena but there is no start, keep looking for a start
# print the last piece of the feature in the end
if list_seg[i][1:] in seg_genome:
out_line=write_line(list_seg,i+1,feat,seg_start,feat_start,seg_genome)
file_out.write(out_line)
# outputs each feature once, from the first to the last segment present on the new genome and its annotation : ADD THE SIZE PARAMETER
if (first_seg!='0') & (last_seg!='0'):
strand=list_seg[i-1][0:1]
line=chr+" graph_gff "+Features[feat].type+" "+str(get_start_2(first_seg,seg_genome,feat))+" "+str(get_stop_2(last_seg,seg_genome,feat))+" . "+strand+" . "+Features[feat].annot+"\n"
file_out_alt.write(line)
# outputs the detail of the missing fragments for each feature :
# get the list of the segments in a feature that are missing in the genome
for segment in list_seg:
segments_missing.append(Segments[segment[1:]])
if (len(segments_missing)!=0) & (first_seg!='0') & (last_seg!='0'):
# get the lengths of the feature, on the original genome or on the new one
start2=get_start_2(first_seg,seg_genome,feat)
stop2=get_stop_2(last_seg,seg_genome,feat)
size_on_genome=int(stop2)-int(start2)+1
size_diff=size_on_genome-Features[feat].size
diff_list.append(int(size_diff)) # to have stats on the size of the segments missing
line=feat+" : "+str(len(segments_missing))+" variations. length difference (length in the new genome-length in the original genome) : "+str(size_diff)+"\n"
# length of the original feature : "+str(Features[feat].size)+", length of the feature on this genome : "+str(size_on_genome)+"\n"
file_out2.write(line)
# for each segment missing, see where it is on the feature (start, middle, end)
for segment in segments_missing:
if (segment.id==Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#premier et pas dernier
elif (segment.id==Features[feat].segments_list[-1][1:]) & ((segment.id!=Features[feat].segments_list[0][1:])):#dernier et pas premier
elif (segment.id!=Features[feat].segments_list[0][1:]) & (segment.id!=Features[feat].segments_list[-1][1:]):#ni premier ni dernier
line="variation in the middle of the feature, of length "+str(segment.size)+"\n"
# check if its a substitution of an indel
# if its a substitution ou a deletion, print what it is replacing in the original feature
#size_seg_missing.append(segment.size)
file_out2.write("\n")
elif len(segments_missing)!=0:
line=feat+": feature entirely absent\n\n"
file_out2.write(line)
line=feat+": no variation.\n\n"
file_out2.write(line)
#print("difference moyenne : "+str(sum(diff_list)/len(diff_list)))
#import matplotlib.pyplot as plt
#plt.hist(diff_list, range=(-1000,1000))
#plt.show()
#print("segment au millieu taille moyenne : "+str(sum(size_seg_missing)/len(size_seg_missing)))
file_out.close()
gff.close()
file_out2.close()
file_out_alt.close()
# print stats
from statistics import median, mean
if stats==True:
# prints the stats for the segments
#print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
#print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
#print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
#print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
#print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
#print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))
# creates files with the features by category (missing first, middle, end segment, etc)
feat_miss_first=open('features_missing_first.txt','w')
feat_miss_first.write("features missing the first segment on azucena :\n")
for first in feature_missing_first :
feat_miss_first.write(Features[first].annot)
if Features[first].type!='gene':
feat_miss_first.write(';')
feat_miss_first.write(Features[first].note)
feat_miss_first.write("\n")
feat_miss_first.close()
feat_miss_middle=open('features_missing_middle.txt','w')
feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
for middle in feature_missing_middle :
feat_miss_middle.write(Features[middle].annot)
if Features[middle].type!='gene':
feat_miss_middle.write(';')
feat_miss_middle.write(Features[middle].note)
feat_miss_middle.write("\n")
feat_miss_middle.close()
feat_miss_last=open('features_missing_last.txt','w')
feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
for last in feature_missing_last :
feat_miss_last.write(Features[last].annot)
if Features[last].type!='gene':
feat_miss_last.write(';')
feat_miss_last.write(Features[last].note)
feat_miss_last.write("\n")
feat_miss_last.close()
feat_miss=open('features_missing.txt','w')
feat_miss.write("\nfeatures missing entirely on azucena :\n")
for entier in feature_missing_all :
feat_miss.write(Features[entier].annot)
if Features[entier].type!='gene':
feat_miss.write(';')
feat_miss.write(Features[entier].note)
feat_miss.write("\n")
feat_miss.close()
feat_miss_total=open('features_missing_total.txt','w')
feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
for total in feature_missing_total :
feat_miss_total.write(Features[total].annot)
if Features[total].type!='gene':
feat_miss_total.write(';')
feat_miss_total.write(Features[total].note)
feat_miss_total.write("\n")
feat_miss_total.close()
feat_all=open('all_features.txt','w')
for feat in list_feature:
feat_all.write(Features[feat].annot)
if Features[feat].type!='gene':
feat_all.write(';')
feat_all.write(Features[feat].note)
feat_all.write("\n")
feat_all.close()
feat_ok=open('features_ok.txt','w')
feat_ok.write("\nfeatures entirely present in azucena :\n")
for ok in feature_ok :
feat_ok.write(Features[ok].annot)
if Features[ok].type!='gene':
feat_ok.write(';')
feat_ok.write(Features[ok].note)
feat_ok.write("\n")
feat_ok.close()
# prints the stats for the features (hypothetical/putative rate, by category)
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
total=len(feature_missing_first)
print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
total=len(feature_missing_middle)
print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
total=len(feature_missing_last)
print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
total=len(feature_missing_all)
print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
total=len(feature_missing_total)
print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
total=len(feature_ok)
print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
total=len(list_feature)
print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
subprocess.run(command,shell=True)