Newer
Older

nina.marthe_ird.fr
committed
from .intersect import invert_seg
def find_gene_copies(list_seg_source,walk_name,feature_id,segments_on_target_genome):
# find all copies of all segments from the gene in the target genome (in both orientations)
[list_seg_target,list_seg_source_unstranded]=find_all_seg(list_seg_source,walk_name,segments_on_target_genome)
# find each copy of the gene in the ordered list of segments
[first_segs_list,last_segs_list]=detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id,segments_on_target_genome)
# join the first and last seg lists
first_last_segs_list=[]
index=0
for first_seg in first_segs_list:
last_seg=last_segs_list[index]
pair=(first_seg,last_seg)
first_last_segs_list.append(pair)
index+=1
return first_last_segs_list # return a list of pairs (first_seg,last_seg)

nina.marthe_ird.fr
committed
# find all the copies of the segments from the source list in the target genome
def find_all_seg(list_seg_source,walk_name,segments_on_target_genome):
index=0
list_seg_target=[] # contains list of info for each seg : [seg_id,seg_strand,start_pos,index_on_source_walk]
list_seg_source_unstranded=[] # contains the path in the source genome, but with the strand separated from the segment_id : [s24,>]
for seg in list_seg_source:
list_seg_source_unstranded.append([seg[1:],seg[0]]) # seg_id,seg_strand : [s24,>]
seg_inverted=invert_seg(seg)
# look for all the segment copies in the target genome walk, in both orientations

nina.marthe_ird.fr
committed
if (seg in segments_on_target_genome) and (segments_on_target_genome[seg].find_filename(walk_name)): # if the segment is in the target genome on the right walk (chr,ctg)
for copy in segments_on_target_genome[seg].get_all_seg(walk_name): # take all its copies

nina.marthe_ird.fr
committed
seg_info=[seg[1:],seg[0],int(copy[1]),index] # [s24,>,584425,4]
list_seg_target.append(seg_info)

nina.marthe_ird.fr
committed
if (seg_inverted in segments_on_target_genome) and (segments_on_target_genome[seg_inverted].find_filename(walk_name)) : # same but with the segment inverted
for copy in segments_on_target_genome[seg_inverted].get_all_seg(walk_name):

nina.marthe_ird.fr
committed
seg_info=[seg_inverted[1:],seg_inverted[0],int(copy[1]),index]
list_seg_target.append(seg_info)
index+=1
list_seg_target.sort(key=sort_seg_info) # order the list of segments by start position
return [list_seg_target,list_seg_source_unstranded]
# called by find_gene_copies
def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id,segments_on_target_genome):
# prepare the variables for the first iteration of the for loop
[old_id,old_strand,old_start,old_index]=list_seg_target[0]
[first_segs_list,last_segs_list]=[[],[]]
old_seg_id=old_strand+old_id
first_segs_list.append(old_seg_id)
copy_number=1
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)

nina.marthe_ird.fr
committed
for segment in segments_on_target_genome[old_seg_id].get_all_seg(walk_name): # look for the first seg to add the occurence info

nina.marthe_ird.fr
committed
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
if segment[1]==old_start:
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
segment.append(feat_copy)
break
# adjust old_index for the first iteration of the loop
first_inversion=(old_strand!=list_seg_source_unstranded[old_index][1])
if first_inversion:
old_index+=1
else:
old_index-=1
for seg in list_seg_target: # find and annotate (with feat_copy) all the first and last segments of the feature copies
[new_id,new_strand,new_start,new_index]=seg
new_seg_id=new_strand+new_id
inversion=(seg[1]!=list_seg_source_unstranded[new_index][1]) # inversion if this segment's strand is not the same as in the source walk
if inversion :
if not((old_strand==new_strand) and (old_index>new_index)): # not (if the index decreases and the strand stays the same, it is the same gene copy)
last_segs_list.append(old_seg_id)
add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy,segments_on_target_genome) # add info for the last seg of the previous copy
copy_number+=1
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id) # new feature copy, change the info
first_segs_list.append(new_seg_id)
add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy,segments_on_target_genome) # add info for the first seg of the new copy
else:
if not((old_strand==new_strand) and (old_index<new_index)): # not (if the index increases and the strand stays the same, it is the same gene copy)
last_segs_list.append(old_seg_id)
add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy,segments_on_target_genome) # add info for the last seg of the previous copy
copy_number+=1
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id) # new feature copy, change the info
first_segs_list.append(new_seg_id)
add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy,segments_on_target_genome) # add info for the first seg of the new copy
[old_strand,old_index,old_seg_id,old_start]=[new_strand,new_index,new_seg_id,new_start]
# add the last seg info
last_segs_list.append(old_seg_id)
add_occurence_info_seg(old_seg_id,walk_name,new_start,feat_copy,segments_on_target_genome) # add info for the last seg of the last copy
return [first_segs_list,last_segs_list]

nina.marthe_ird.fr
committed
def add_occurence_info_seg(target_seg_id,walk_name,target_start,feat_copy,segments_on_target_genome):

nina.marthe_ird.fr
committed
for segment in segments_on_target_genome[target_seg_id].get_all_seg(walk_name): # look for the right occurence of the segment

nina.marthe_ird.fr
committed
if segment[1]==target_start:
segment.append(feat_copy) # add seg info
break
def sort_seg_info(seg_info):
return seg_info[2]

nina.marthe_ird.fr
committed
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# get the first and last segment of the list that is in the target genome (possibly several pairs)
def get_first_last_seg(list_seg,segments_on_target_genome):
list_first_last_segs=[]
[first_seg_found,last_seg_found,walk_found]=['','','']
list_walks=get_walks_feature_cross(list_seg,segments_on_target_genome) # get all the walks where there is a segment of the feature
for walk in list_walks: # find the first and last seg for each walk
for segment in list_seg: # look for first_seg
seg_found=search_seg_on_walk(segment,walk,segments_on_target_genome)
if seg_found:
first_seg_found=seg_found
break
if first_seg_found!='': # look for last_seg
for segment in reversed(list_seg):
last_seg_found=search_seg_on_walk(segment,walk,segments_on_target_genome)
if last_seg_found:
walk_found=walk
break
list_first_last_segs.append([first_seg_found,last_seg_found,walk_found])
[first_seg_found,last_seg_found,walk_found]=['','','']
# return all the match
return list_first_last_segs
# find all the walks that contain a segment of the feature (list_seg is the walk of the feature on the source genome)
def get_walks_feature_cross(list_seg,segments_on_target_genome):
list_walks=list()
for segment in list_seg:
seg_found=search_seg_on_target_genome(segment,segments_on_target_genome)
if seg_found: # if the segment or the reverse complement is on the target genome

nina.marthe_ird.fr
committed
for walk in segments_on_target_genome[seg_found].get_all_filenames():

nina.marthe_ird.fr
committed
if walk not in list_walks:
list_walks.append(walk)
return list_walks
# look for a segment on a walk, in either orientations
def search_seg_on_walk(segment,walk,segments_on_target_genome): # for now just print the first found, look for several later...
inverted_segment=invert_seg(segment)
if segment in segments_on_target_genome:

nina.marthe_ird.fr
committed
if segments_on_target_genome[segment].find_filename(walk):
return segment
elif inverted_segment in segments_on_target_genome:

nina.marthe_ird.fr
committed
if segments_on_target_genome[inverted_segment].find_filename(walk):
return inverted_segment
else:
return False
# look for the segment on either strand of the target genome
def search_seg_on_target_genome(segment,segments_on_target_genome):
inverted_segment=invert_seg(segment)
if segment in segments_on_target_genome:
#if inverted_segment in segments_on_target_genome:
# print(segment," found in both orientations")
return segment
elif inverted_segment in segments_on_target_genome:
#print("inverted seg found *****")
return inverted_segment
else:
return False