Skip to content
Snippets Groups Projects
Commit 83f25031 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

reorganised some functions, made them smaller, added comments

parent 0db96d54
No related branches found
No related tags found
No related merge requests found
......@@ -416,102 +416,124 @@ def add_target_genome_paths(feature_id,target_genome_paths):
list_seg=feature.segments_list_source
list_first_last_segs=get_first_last_seg(list_seg)
for match in list_first_last_segs:
for match in list_first_last_segs: # for each walk that has the feature
[first_seg,last_seg,walk_name]=match
# get the first and last segments of all the copies
[first_last_segs_list]=detect_gene_copies(list_seg,walk_name,feature_id)
# get the first and last segments of all the copies on this walk
first_last_segs_list=find_gene_copies(list_seg,walk_name,feature_id)
copy_number=0
for first_seg,last_seg in first_last_segs_list: # get the feature path for all the copies
copy_number+=1
copy_id="copy_"+str(copy_number) # get the copy that corresponds to this pair of first_seg,last_seg
feature_path=[walk_name]
feature_path.append(copy_id)
feature_target_path=get_feature_path(target_genome_paths[walk_name],first_seg,last_seg,walk_name,copy_id,feature_id)
feature_path.append(feature_target_path)
feature_path=[walk_name,copy_id,feature_target_path] # informations about the occurence of the feature
feature.segments_list_target.append(feature_path)
# add the feat_copy to all the segments !!!
# get the pos start of first and last, they are always present
# get the pos start of the first and last segments (existing because done in find_gene_copies)
# then look for segs that are between.
walk_start=get_seg_occ(feature_target_path[0],walk_name,feature_id,copy_id)[1]
walk_stop=get_seg_occ(feature_target_path[-1],walk_name,feature_id,copy_id)[2]
feat_copy=(feature_id,copy_id)
for seg in feature_target_path[1:-1]: # the first and last segs are already done in detect_gene_copies
for seg in feature_target_path[1:-1]: # the first and last segs are already done in find_gene_copies
for segment in segments_on_target_genome[seg][walk_name]: # find the right occurence
if walk_start <= segment[1] <= walk_stop:
segment.append(feat_copy)
segment.append(feat_copy) # annotate the segment : feature_id, copy_id
break
# add paths of childs
childs_list=get_child_list(feature_id)
for child_id in childs_list:
child=Features[child_id]
# add the paths of the gene's child features
add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths)
if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ?
feature.segments_list_target.append(['','',[]])
# find child path in parent's path
[child_first_seg,child_last_seg]=['','']
for seg in child.segments_list_source: # DOESNT TAKE INTO ACCOUNT THE INVERSIONS !!!
if seg in feature_target_path: # parent path
child_first_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_first_seg=invert_seg(seg)
break
def add_childs_paths(feature_id,feature_target_path,walk_name,copy_id,target_genome_paths):
childs_list=get_child_list(feature_id)
for child_id in childs_list:
child=Features[child_id]
# find child path in parent's path
[child_first_seg,child_last_seg]=['','']
for seg in child.segments_list_source:
if seg in feature_target_path: # parent path
child_first_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_first_seg=invert_seg(seg)
break
if child_first_seg!='': # the child may be absent in this occurence !
for seg in reversed(child.segments_list_source):
if seg in feature_target_path: # parent path
child_last_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_last_seg=invert_seg(seg)
break
# get the index of these segs, to get the sous-liste
child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id)
feat_copy=(child_id,copy_id)
feature_path=[walk_name,copy_id]
feature_path.append(child_path)
child.segments_list_target.append(feature_path)
for segment_id in child_path:
segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id)
segment.append(feat_copy)
if child_first_seg!='': # the child feature may be absent in this occurence of the parent
for seg in reversed(child.segments_list_source):
if seg in feature_target_path: # parent path
child_last_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_last_seg=invert_seg(seg)
break
if len(list_first_last_segs)==0: # the latter steps expect this list to not be empty. ALSO DO IT FOR THE CHILDS ?
feature.segments_list_target.append(['','',[]])
# get the path of the child feature
child_path=get_feature_path(target_genome_paths[walk_name],child_first_seg,child_last_seg,walk_name,copy_id,feature_id)
feat_copy=(child_id,copy_id)
def detect_gene_copies(list_seg_source,walk_name,feature_id):
feature_path=[walk_name,copy_id]
feature_path.append(child_path)
child.segments_list_target.append(feature_path)
# find all copies of all segments from the gene in the target genome (in both orientations)
for segment_id in child_path: # annotate the segments with info about the occurence of the child feature
segment=get_seg_occ(segment_id,walk_name,feature_id,copy_id)
segment.append(feat_copy)
# find all the copies of the segments from the source list in the target genome
def find_all_seg(list_seg_source,walk_name):
index=0
list_seg_target=[] # contains list of info for each seg [seg_id,seg_strand,start_pos,index_on_source_walk]
list_seg_source_unstranded=[]
list_seg_target=[] # contains list of info for each seg : [seg_id,seg_strand,start_pos,index_on_source_walk]
list_seg_source_unstranded=[] # contains the path in the source genome, but with the strand separated from the segment_id : [s24,>]
for seg in list_seg_source:
list_seg_source_unstranded.append([seg[1:],seg[0]]) # seg_id,seg_strand : [s24,>]
seg_inverted=invert_seg(seg)
# look for all the segment copies in the target genome walk, in both orientations
if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]):
for copy in segments_on_target_genome[seg][walk_name]:
if (seg in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg]): # if the segment is in the target genome on the right walk (chr,ctg)
for copy in segments_on_target_genome[seg][walk_name]: # take all its copies
seg_info=[seg[1:],seg[0],int(copy[1]),index] # [s24,>,584425,4]
list_seg_target.append(seg_info)
if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) :
if (seg_inverted in segments_on_target_genome) and (walk_name in segments_on_target_genome[seg_inverted]) : # same but with the segment inverted
for copy in segments_on_target_genome[seg_inverted][walk_name]:
seg_info=[seg_inverted[1:],seg_inverted[0],int(copy[1]),index]
list_seg_target.append(seg_info)
index+=1
list_seg_target.sort(key=sort_seg_info) # order the list of segments by start position
old_index=list_seg_target[0][3]
old_strand=list_seg_target[0][1]
copy_number=1
first_segs_list=[]
last_segs_list=[]
old_seg_id=list_seg_target[0][1]+list_seg_target[0][0]
return [list_seg_target,list_seg_source_unstranded]
def find_gene_copies(list_seg_source,walk_name,feature_id):
# find all copies of all segments from the gene in the target genome (in both orientations)
[list_seg_target,list_seg_source_unstranded]=find_all_seg(list_seg_source,walk_name)
# find each copy of the gene in the ordered list of segments
[first_segs_list,last_segs_list]=detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id)
# join the first and last seg lists
first_last_segs_list=[]
index=0
for first_seg in first_segs_list:
last_seg=last_segs_list[index]
pair=(first_seg,last_seg)
first_last_segs_list.append(pair)
index+=1
return first_last_segs_list # return a list of pairs (first_seg,last_seg)
# called by find_gene_copies
def detect_copies(list_seg_target,list_seg_source_unstranded,walk_name,feature_id):
# prepare the variables for the first iteration of the for loop
[old_id,old_strand,old_start,old_index]=list_seg_target[0]
[first_segs_list,last_segs_list]=[[],[]]
old_seg_id=old_strand+old_id
first_segs_list.append(old_seg_id)
old_start=list_seg_target[0][2]
for segment in segments_on_target_genome[old_seg_id][walk_name]: # add first seg info
copy_number=1
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
for segment in segments_on_target_genome[old_seg_id][walk_name]: # look for the first seg to add the occurence info
if segment[1]==old_start:
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
......@@ -523,71 +545,48 @@ def detect_gene_copies(list_seg_source,walk_name,feature_id):
old_index+=1
else:
old_index-=1
# find each copy of the gene in the ordered list of segments
for seg in list_seg_target:
new_seg_id=seg[1]+seg[0]
new_index=seg[3] # index in the list_source
new_strand=seg[1]
seg_start=seg[2]
for seg in list_seg_target: # find and annotate (with feat_copy) all the first and last segments of the feature copies
[new_id,new_strand,new_start,new_index]=seg
new_seg_id=new_strand+new_id
inversion=(seg[1]!=list_seg_source_unstranded[new_index][1]) # inversion if this segment's strand is not the same as in the source walk
if inversion :
if not((old_strand==new_strand) and (old_index>new_index)): # if the index decreases and the strand stays the same, it is the same gene copy
if not((old_strand==new_strand) and (old_index>new_index)): # not (if the index decreases and the strand stays the same, it is the same gene copy)
last_segs_list.append(old_seg_id)
for segment in segments_on_target_genome[old_seg_id][walk_name]: # add last seg info
if segment[1]==old_start:
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
segment.append(feat_copy)
break
add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy) # add info for the last seg of the previous copy
copy_number+=1
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id) # new feature copy, change the info
first_segs_list.append(new_seg_id)
for segment in segments_on_target_genome[new_seg_id][walk_name]: # add first seg info
if segment[1]==seg_start:
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
segment.append(feat_copy)
break
add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy) # add info for the first seg of the new copy
else:
if not((old_strand==new_strand) and (old_index<new_index)): # if the index increases and the strand stays the same, it is the same gene copy
if not((old_strand==new_strand) and (old_index<new_index)): # not (if the index increases and the strand stays the same, it is the same gene copy)
last_segs_list.append(old_seg_id)
for segment in segments_on_target_genome[old_seg_id][walk_name]: # add last seg info
if segment[1]==old_start:
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
segment.append(feat_copy)
break
add_occurence_info_seg(old_seg_id,walk_name,old_start,feat_copy) # add info for the last seg of the previous copy
copy_number+=1
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id) # new feature copy, change the info
first_segs_list.append(new_seg_id)
for segment in segments_on_target_genome[new_seg_id][walk_name]: # add first seg info
if segment[1]==seg_start:
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
segment.append(feat_copy)
break
old_strand=new_strand
old_index=new_index
old_seg_id=new_seg_id
old_start=seg_start
add_occurence_info_seg(new_seg_id,walk_name,new_start,feat_copy) # add info for the first seg of the new copy
[old_strand,old_index,old_seg_id,old_start]=[new_strand,new_index,new_seg_id,new_start]
# add the last seg info
last_segs_list.append(old_seg_id)
for segment in segments_on_target_genome[old_seg_id][walk_name]: # add last seg info
if segment[1]==seg_start:
copy_id="copy_"+str(copy_number)
feat_copy=(feature_id,copy_id)
segment.append(feat_copy)
break
add_occurence_info_seg(old_seg_id,walk_name,new_start,feat_copy) # add info for the last seg of the last copy
first_last_segs_list=[]
index=0
for first_seg in first_segs_list:
last_seg=last_segs_list[index]
pair=(first_seg,last_seg)
first_last_segs_list.append(pair)
index+=1
return [first_segs_list,last_segs_list]
return [first_last_segs_list] # return a list of pairs (first_seg,last_seg)
def add_occurence_info_seg(target_seg_id,walk_name,target_start,feat_copy):
for segment in segments_on_target_genome[target_seg_id][walk_name]: # look for the right occurence of the segment
if segment[1]==target_start:
segment.append(feat_copy) # add seg info
break
def sort_seg_info(seg_info):
return seg_info[2]
......
......@@ -25,48 +25,13 @@ def generate_target_gff(feature_id,seg_size,args,reason_features_not_transfered,
match.append(False) # store the information that this gene copy didnt pass the filters
return "no"
if right_size(size_on_new_genome,max_diff,feature_id):# or (feature.type!="gene"):
if right_size(size_on_new_genome,max_diff,feature_id):
line_gff=create_line_target_gff(first_seg,last_seg,feature_id,size_diff,inversion,walk,cov,id,copy_id) # transfer the gene
write_line(line_gff,output_target_gff,False)
# transfer all the gene's childs
# MAKE A FUNCTION FOR THAT
childs_list=get_child_list(feature_id)
for child_id in childs_list:
child=Features[child_id]
# look for the first and last seg of the child in its parent path
[child_first_seg,child_last_seg]=['','']
for seg in child.segments_list_source: # get first seg in the parent path
if seg in feature_target_path: # parent path
child_first_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_first_seg=invert_seg(seg)
break
if child_first_seg!='': # the child may be absent in this occurence !
for seg in reversed(child.segments_list_source): # get last seg in the parent path
if seg in feature_target_path: # parent path
child_last_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_last_seg=invert_seg(seg)
break
if inversion: # done twice, make a function
child_size_on_new_genome=get_feature_start_on_target_genome_inv(child_last_seg,child_id,walk,copy_id)-get_feature_stop_on_target_genome_inv(child_first_seg,child_id,walk,copy_id)+1
else:
child_size_on_new_genome=get_feature_stop_on_target_genome(child_last_seg,child_id,walk,copy_id)-get_feature_start_on_target_genome(child_first_seg,child_id,walk,copy_id)+1
child_size_diff=abs(child_size_on_new_genome-child.size)
for match in child.segments_list_target:
if (match[0]==walk) and (match[1]==copy_id):
seg_list=match[2]
[cov,id]=get_id_cov(child_id,seg_size,seg_list)
# transfer the child feature:
line_gff=create_line_target_gff(child_first_seg,child_last_seg,child_id,child_size_diff,inversion,walk,cov,id,copy_id)
write_line(line_gff,output_target_gff,False)
transfer_childs(feature_id,feature_target_path,walk,copy_id,inversion,seg_size)
match.append(True) # store the information that this gene copy was transfered
return size_diff
......@@ -78,7 +43,44 @@ def generate_target_gff(feature_id,seg_size,args,reason_features_not_transfered,
reason_features_not_transfered[1]+=1
match.append(False) # store the information that this gene copy didnt pass the filters
return "no"
def transfer_childs(feature_id,feature_target_path,walk,copy_id,inversion,seg_size):
childs_list=get_child_list(feature_id)
for child_id in childs_list:
child=Features[child_id]
# look for the first and last seg of the child in its parent path
[child_first_seg,child_last_seg]=['','']
for seg in child.segments_list_source: # get first seg in the parent path
if seg in feature_target_path: # parent path
child_first_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_first_seg=invert_seg(seg)
break
if child_first_seg!='': # check that the child feature is not absent in this occurence of the parent feature
for seg in reversed(child.segments_list_source): # get last seg in the parent path
if seg in feature_target_path: # parent path
child_last_seg=seg
break
elif invert_seg(seg) in feature_target_path:
child_last_seg=invert_seg(seg)
break
if inversion: # calculate the child feature size
child_size_on_new_genome=get_feature_start_on_target_genome_inv(child_last_seg,child_id,walk,copy_id)-get_feature_stop_on_target_genome_inv(child_first_seg,child_id,walk,copy_id)+1
else:
child_size_on_new_genome=get_feature_stop_on_target_genome(child_last_seg,child_id,walk,copy_id)-get_feature_start_on_target_genome(child_first_seg,child_id,walk,copy_id)+1
child_size_diff=abs(child_size_on_new_genome-child.size)
for match in child.segments_list_target: # look for the right occurence of the child feature
if (match[0]==walk) and (match[1]==copy_id):
seg_list=match[2]
[cov,id]=get_id_cov(child_id,seg_size,seg_list)
# transfer the child feature:
line_gff=create_line_target_gff(child_first_seg,child_last_seg,child_id,child_size_diff,inversion,walk,cov,id,copy_id)
write_line(line_gff,output_target_gff,False)
# handles all the transfer, variation, alignment (stats?), on the target genome.
def transfer_on_target(segments_file, out_target_gff, out_var,out_aln,target_genome,target_genome_paths,list_feat_absent,seg_size,args):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment