You need to sign in or sign up before continuing.
Newer
Older

nina.marthe_ird.fr
committed
from .usefull_little_functions import get_seg_occ
from .intersect import invert_seg, Features
from .detect_inversion import detect_feature_inversion,invert_segment_list
from .detect_copies import find_gene_copies,get_first_last_seg
# functions to output the stats on the transfer
# stats about missing segments and feature type, not used, will change.

nina.marthe_ird.fr
committed
def stats_feature_missing_segment(feature_missing_segments,first_seg,last_seg,list_seg,feature_id,walk,segments_on_target_genome):
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
feature_missing_segments[5].append(feature_id)
if first_seg=='' : # no segment of the feature is in the genome, the feature is missing entirely
feature_missing_segments[3].append(feature_id)
elif first_seg != list_seg[0]: # the first segment is missing
feature_missing_segments[0].append(feature_id)
elif last_seg!=list_seg[-1]: # the last segment is missing
feature_missing_segments[2].append(feature_id)
# go through all the segments, check if some are missing in the middle of the feature
elif (len(list_seg)!=1) and (feature_id not in feature_missing_segments[3]): # to access the second to last element
for segment in list_seg[1-(len(list_seg)-2)]:
if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
feature_missing_segments[1].append(feature_id)
break
# go through the segments, to see if one is missing anywhere on the feature
for segment in list_seg:
if (segment not in segments_on_target_genome) or (walk not in segments_on_target_genome[segment]):
if feature_id not in feature_missing_segments[4]:
feature_missing_segments[4].append(feature_id)
break
# if the feature doesnt have a missing segment, it is complete. ADD THE PATH CHECK FOR INSERTIONS !!
if feature_id not in feature_missing_segments[4]:
feature_missing_segments[6].append(feature_id)
def get_annot_features(list_features):
list_annot_features=[]
for feature in list_features:
list_annot_features.append(Features[feature].note)
return list_annot_features
def count_hypput_total(list_annot_first):
total=len(list_annot_first)
count_hypput=0
for annot in list_annot_first:
if ("hypothetical" in annot) or ("putative" in annot):
count_hypput+=1
return [count_hypput,total]
# print stats on the transfer : number of feature that have segments in different positions missing.
def stats_features(feature_missing_segments):
# [feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,feature_total,feature_ok]
list_annot_first=get_annot_features(feature_missing_segments[0])
[hyp_put,total]=count_hypput_total(list_annot_first)
print("\nthe first segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_middle=get_annot_features(feature_missing_segments[1])
[hyp_put,total]=count_hypput_total(list_annot_middle)
print("a middle segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_last=get_annot_features(feature_missing_segments[2])
[hyp_put,total]=count_hypput_total(list_annot_last)
print("the last segment is missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_all=get_annot_features(feature_missing_segments[3])
[hyp_put,total]=count_hypput_total(list_annot_all)
print(total,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_total=get_annot_features(feature_missing_segments[4])
[hyp_put,total]=count_hypput_total(list_annot_total)
print("there is at least one segment missing for", total,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_ok=get_annot_features(feature_missing_segments[6])
[hyp_put,total]=count_hypput_total(list_annot_ok)
print(total ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
list_annot_features=get_annot_features(feature_missing_segments[5])
[hyp_put,total]=count_hypput_total(list_annot_features)
print("there is", total,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.")
# functions to generate the different gffs
# add the paths of the feature on the target genome in the object Feature

nina.marthe_ird.fr
committed
def add_target_genome_paths(feature_id,target_genome_paths,segments_on_target_genome):
feature=Features[feature_id]
list_seg=feature.segments_list_source

nina.marthe_ird.fr
committed
list_first_last_segs=get_first_last_seg(list_seg,segments_on_target_genome)
for match in list_first_last_segs: # for each walk that has the feature
[first_seg,last_seg,walk_name]=match
# get the first and last segments of all the copies on this walk

nina.marthe_ird.fr
committed
first_last_segs_list=find_gene_copies(list_seg,walk_name,feature_id,segments_on_target_genome)
copy_number=0
for first_seg,last_seg in first_last_segs_list: # get the feature path for all the copies
copy_number+=1
copy_id="copy_"+str(copy_number) # get the copy that corresponds to this pair of first_seg,last_seg

nina.marthe_ird.fr
committed
feature_target_path=get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name,copy_id,feature_id,segments_on_target_genome)
feature_path=[walk_name,copy_id,feature_target_path] # informations about the occurence of the feature
feature.segments_list_target.append(feature_path)

nina.marthe_ird.fr
committed
# add the feat_copy to all the segments
# get the pos start of the first and last segments (existing because done in find_gene_copies)
# then look for segs that are between.

nina.marthe_ird.fr
committed
walk_start=get_seg_occ(feature_target_path[0],walk_name,feature_id,copy_id,segments_on_target_genome)[1]
walk_stop=get_seg_occ(feature_target_path[-1],walk_name,feature_id,copy_id,segments_on_target_genome)[2]
feat_copy=(feature_id,copy_id)
for seg in feature_target_path[1:-1]: # the first and last segs are already done in find_gene_copies
for segment in segments_on_target_genome[seg][walk_name]: # find the right occurence
if walk_start <= segment[1] <= walk_stop:
segment.append(feat_copy) # annotate the segment : feature_id, copy_id
break
# add the paths of the gene's child features

nina.marthe_ird.fr
committed
add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths,segments_on_target_genome)
if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ?
feature.segments_list_target.append(['','',[]])

nina.marthe_ird.fr
committed
def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths,segments_on_target_genome):
childs_list=Features[feature_id].get_child_list(Features)
for child_id in childs_list:
child=Features[child_id]
# find child path in parent's path
[child_first_seg,child_last_seg]=['','']
for seg in child.segments_list_source:
if seg in feature_target_path: # parent path
child_first_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_first_seg=invert_seg(seg)
break
if child_first_seg!='': # the child feature may be absent in this occurence of the parent
for seg in reversed(child.segments_list_source):
if seg in feature_target_path: # parent path
child_last_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_last_seg=invert_seg(seg)
break
# get the path of the child feature

nina.marthe_ird.fr
committed
child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id,segments_on_target_genome)
feat_copy=(child_id,copy_id)
feature_path=[walk_name,copy_id]
feature_path.append(child_path)
child.segments_list_target.append(feature_path)
for segment_id in child_path: # annotate the segments with info about the occurence of the child feature

nina.marthe_ird.fr
committed
segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id,segments_on_target_genome)
segment.append(feat_copy)
# find the feature's path in target genome walk

nina.marthe_ird.fr
committed
def get_feature_path(target_genome_path,first_seg,last_seg,walk_name,copy_id,feature_id,segments_on_target_genome):
# look for first_seg and last_seg that has the right copy_id for this feature

nina.marthe_ird.fr
committed
first_seg_index=get_seg_occ(first_seg,walk_name,feature_id,copy_id,segments_on_target_genome)[4]
last_seg_index=get_seg_occ(last_seg,walk_name,feature_id,copy_id,segments_on_target_genome)[4]
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
first_index=min(first_seg_index,last_seg_index)
last_index=max(first_seg_index,last_seg_index)
feature_path_target_genome=target_genome_path[first_index:last_index+1]
return feature_path_target_genome
# get the coverage and sequence id of a feature
def get_id_cov(feature_id,seg_size,target_list): # seg_size has unoriented segments : s25
feature=Features[feature_id]
source_list=feature.segments_list_source
inversion=detect_feature_inversion(source_list,target_list)
if inversion:
target_list=invert_segment_list(target_list)
[match,subs,inser,delet]=[0,0,0,0]
[i,j]=[0,0]
first=True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on
last=False # same for the last part of the feature # for id and ins.
while (i<len(source_list)) and (j<len(target_list)):
if i==len(source_list)-1:
last=True
if source_list[i] != target_list[j]: # if there is a difference between the two paths
if target_list[j] not in source_list: # if the segment in target genome is absent in source genome
if source_list[i] not in target_list: # if the segment in source genome is absent is target genome : substitution
add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1;j+=1
else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion
add=segment_id_cov("insertion",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
j+=1
elif source_list[i] not in target_list: # source_genome segment not in target genome, but target genome segment in source_genome : deletion
add=segment_id_cov("deletion",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1
else : # if both segments are present in the other genome but not at the same position. weird case never found yet
add=segment_id_cov("substitution",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1;j+=1
else: # segment present in both, no variation.
add=segment_id_cov("identity",seg_size,source_list[i],target_list[j],first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]
i+=1;j+=1
if i<=len(source_list)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
add=segment_id_cov("end_deletion",seg_size,source_list[i:],'',first,feature,last)
match+=add[0];subs+=add[1];inser+=add[2];delet+=add[3];first=add[4]

nina.marthe_ird.fr
committed
denom_cov=match+subs+delet
denom_id=denom_cov+inser
[cov,id]=[0,0]
if denom_cov>0:
cov=round((match+subs)/(match+subs+delet),3)
if denom_id>0:
id=round(match/(match+subs+inser+delet),3)
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
return [cov,id]
# computes the cov/id calculation for a segment pair
def segment_id_cov(type,seg_size,seg_a,seg_b,first,feature,last):
[match,subs,inser,delet]=[0,0,0,0]
match type:
case "identity":
if first:
match+=seg_size[seg_a[1:]]-feature.pos_start+1
elif last:
match+=feature.pos_stop
else:
match+=seg_size[seg_a[1:]]
case "substitution":
if seg_size[seg_b[1:]]!=seg_size[seg_a[1:]]: # substitution can be between segments of different size
if seg_size[seg_b[1:]]>seg_size[seg_a[1:]]:
subs+=seg_size[seg_a[1:]]
inser+=seg_size[seg_b[1:]]-seg_size[seg_a[1:]]
elif seg_size[seg_b[1:]]<seg_size[seg_a[1:]]:
subs+=seg_size[seg_b[1:]]
delet+=seg_size[seg_a[1:]]-seg_size[seg_b[1:]]
else:
subs+=seg_size[seg_a[1:]]
case "insertion":
inser+=seg_size[seg_b[1:]]
case "deletion":
if first:
delet+=seg_size[seg_a[1:]]-feature.pos_start+1
else:
delet+=seg_size[seg_a[1:]]
case "end_deletion":
for seg in seg_a[:-1]:
delet+=seg_size[seg[1:]]
delet+=feature.pos_stop
return [match,subs,inser,delet,False] # check the orientation of the segment later