You need to sign in or sign up before continuing.
Newer
Older

nina.marthe_ird.fr
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from .intersect import Features
from .usefull_little_functions import get_segment_sequence
from .detect_inversion import detect_feature_inversion,invert_segment_list
# functions to get the alignment for the transfered genes
# get the alignment between the features on the source genome and the features on the target genome
def print_alignment(feat,match,seg_seq,file_out_aln):
if match[0]!='':
feature_target_path=match[2]
if len(feature_target_path)>0: # if the feature is not completely absent
feature_path_source_genome=Features[feat].segments_list_source
inversion=detect_feature_inversion(feature_path_source_genome,feature_target_path)
if inversion:
feature_target_path=invert_segment_list(feature_target_path)
line_aln=create_line_aln(feature_path_source_genome,feature_target_path,seg_seq,feat)
file_out_aln.write(line_aln)
# creates an alignment for two segments
def segment_aln(type,seg_seq,seg_a,seg_b,first,feature_id,last):
match type:
case "identity":
if first:
feature=Features[feature_id]
seq_aln=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]
elif last:
feature=Features[feature_id]
seq_aln=get_segment_sequence(seg_seq,seg_a)[:feature.pos_stop]
else:
seq_aln=get_segment_sequence(seg_seq,seg_a)
line_a=seq_aln
line_b=seq_aln
len_aln=len(seq_aln)
line_c=len_aln*"*"
case "substitution":
seq_aln_a=get_segment_sequence(seg_seq,seg_a)
seq_aln_b=get_segment_sequence(seg_seq,seg_b)
len_a=len(seq_aln_a)
len_b=len(seq_aln_b)
if len_a>len_b:
diff_len=len_a-len_b
line_a=seq_aln_a
line_b=seq_aln_b+diff_len*"-"
line_c=len_a*" "
else:
diff_len=len_b-len_a
line_a=seq_aln_a+diff_len*"-"
line_b=seq_aln_b
line_c=len_b*" "
case "insertion":
seq_aln_b=get_segment_sequence(seg_seq,seg_b)
len_b=len(seq_aln_b)
line_a=len_b*"-"
line_b=seq_aln_b
line_c=len_b*" "
case "deletion":
if first:
feature=Features[feature_id]
seq_aln_a=get_segment_sequence(seg_seq,seg_a)[feature.pos_start-1:]
else:
seq_aln_a=get_segment_sequence(seg_seq,seg_a)
len_a=len(seq_aln_a)
line_a=seq_aln_a
line_b=len_a*"-"
line_c=len_a*" "
case "end_deletion":
seq_aln_a=""
for segment in seg_a[:-1]:
seq_aln_a+=get_segment_sequence(seg_seq,segment)
feature=Features[feature_id]
seq_aln_a+=get_segment_sequence(seg_seq,seg_a[-1])[0:feature.pos_stop] # for the last segment, only take the part that the feature is on
len_a=len(seq_aln_a)
line_a=seq_aln_a
line_b=len_a*"-"
line_c=len_a*" "
return [line_a,line_b,line_c,False] # check the orientation of the segment later
# formats the alignment lines
def parse_aln_lines(line_a,line_b,line_c,feature_id):
if (len(line_a)!=len(line_b)) or (len(line_b)!=len(line_c)):
print("line lengths differ in alignment")
len_to_parse=len(line_a)
len_parsed=0
aln_line=""
nb_res_a=0
nb_res_b=0
while len_parsed<len_to_parse:
len_header=len(feature_id)+11
headers=[feature_id+"_source ",feature_id+"_target ",len_header*" "]
add_a=line_a[len_parsed:len_parsed+60]
add_b=line_b[len_parsed:len_parsed+60]
add_c=line_c[len_parsed:len_parsed+60]
nb_res_a+=len(add_a)-add_a.count("-")
nb_res_b+=len(add_b)-add_b.count("-")
aln_line+=f'{headers[0]}{add_a} {nb_res_a}\n'
aln_line+=f'{headers[1]}{add_b} {nb_res_b}\n'
aln_line+=f'{headers[2]}{add_c}\n\n'
len_parsed+=60
aln_line+="\n"
return aln_line
# creates the alignment for a feature
def create_line_aln(feature_path_source_genome,feature_path_target_genome,seg_seq,feature_id):
line_a=""
line_b=""
line_c=""
[i,j]=[0,0]
first=True # when writing the first part of the feature, dont take the whole segment, only the part that the feature is on
last=False # same for the last part of the feature
while (i<len(feature_path_source_genome)) and (j<len(feature_path_target_genome)):
if i==len(feature_path_source_genome)-1:
last=True
if feature_path_source_genome[i] != feature_path_target_genome[j]: # if there is a difference between the two paths
if feature_path_target_genome[j] not in feature_path_source_genome: # if the segment in target genome is absent in source genome
if feature_path_source_genome[i] not in feature_path_target_genome: # if the segment in source genome is absent is target genome : substitution
[add_a,add_b,add_c,first]=segment_aln("substitution",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1;j+=1
else: # target genome segment not in source_genome, but source_genome segment in target genome : insertion
[add_a,add_b,add_c,first]=segment_aln("insertion",seg_seq,"",feature_path_target_genome[j],first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
j+=1
elif feature_path_source_genome[i] not in feature_path_target_genome: # source_genome segment not in target genome, but target genome segment in source_genome : deletion
[add_a,add_b,add_c,first]=segment_aln("deletion",seg_seq,feature_path_source_genome[i],"",first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1
else : # if both segments are present in the other genome but not at the same position. weird case never found yet
[add_a,add_b,add_c,first]=segment_aln("substitution",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1;j+=1
else: # segment present in both, no variation.
[add_a,add_b,add_c,first]=segment_aln("identity",seg_seq,feature_path_source_genome[i],feature_path_target_genome[j],first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
i+=1;j+=1
if i<=len(feature_path_source_genome)-1: # if we didn't reach the length of the segment list for the first genome, the end is missing for the second genome
[add_a,add_b,add_c,first]=segment_aln("end_deletion",seg_seq,feature_path_source_genome[i:],"",first,feature_id,last)
line_a+=add_a;line_b+=add_b;line_c+=add_c
return parse_aln_lines(line_a,line_b,line_c,feature_id)