diff --git a/getSegmentsCoordinates.py b/getSegmentsCoordinates.py index 3eccffa99fbffca4cb6e5d22119ab8676655c5ce..dbd56950924a0fb999a16861e9e2f33ed47845e0 100644 --- a/getSegmentsCoordinates.py +++ b/getSegmentsCoordinates.py @@ -84,7 +84,7 @@ for line in lines : seg_start=position seg_name='s'+path[i][1:] - seg_stop=position+segments_size[seg_name]-1 + seg_stop=position+segments_size[seg_name] if path[i][0:1]=='>': orientation='+' diff --git a/inference.py b/inference.py index 84d4d1634c428284a6564f43cfd9a0ca4d936525..3d2c050ada9f913fee4a932a43822eadbba5491a 100644 --- a/inference.py +++ b/inference.py @@ -53,10 +53,12 @@ def get_segments_positions_on_genome(pos_seg): # creates a dict with the positio def add_feature_sequence(feature,seg_seq): # add the feature's sequence in the Feature object feature_sequence="" for segment in feature.segments_list: + if (segment==feature.segments_list[0]) & (segment==feature.segments_list[-1]):# the segment is the first AND the last of the feature + feature_sequence=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:feature.pos_stop-1] if segment==feature.segments_list[0]: feature_sequence+=get_segment_sequence(seg_seq,segment)[feature.pos_start-2:] # revérifier les +/- 1 pour la position, avec de vraies données elif segment==feature.segments_list[-1]: - feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop] # revérifier les +/- 1 pour la position, avec de vraies données + feature_sequence+=get_segment_sequence(seg_seq,segment)[0:feature.pos_stop-1] # revérifier les +/- 1 pour la position, avec de vraies données else: feature_sequence+=get_segment_sequence(seg_seq,segment) feature.sequence=feature_sequence @@ -171,26 +173,31 @@ def get_sequence_on_genome(feature,segments_on_target_genome): # returns the seq if segment==cds.segments_list[0]: new_sequence+=get_segment_sequence(seg_seq,segment)[cds.pos_start-2:] elif segment==cds.segments_list[-1]: - new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop] + new_sequence+=get_segment_sequence(seg_seq,segment)[0:cds.pos_stop-1] else: new_sequence+=get_segment_sequence(seg_seq,segment) return new_sequence -def get_cds_variations(var_file): # creates dict : cds_id -> list of variations from the var file +def get_mrna_var(var_file): var=open(var_file,'r') lines=var.readlines() var.close() - - # dict cds-var - cds_var={} + + mrna_var={} for line in lines: line=line.split() - if line[1]=="CDS": + feature_type=line[1] + if feature_type=="mRNA": + mrna_id=line[0].replace('.','_').replace(':','_') + if mrna_id not in mrna_var.keys(): + mrna_var[mrna_id]={} + elif feature_type=="CDS": cds_id=line[0].replace('.','_').replace(':','_') - if cds_id not in cds_var.keys(): - cds_var[cds_id]=list() - cds_var[cds_id].append(line) - return cds_var + parent_id=line[0][0:line[0].find("cds")-1] + if cds_id not in mrna_var[parent_id]: + mrna_var[parent_id][cds_id]=list() + mrna_var[parent_id][cds_id].append(line) + return mrna_var import re def findOtherStart(cds,segments_on_target_genome): # look for another start codon, before or after the original one @@ -247,104 +254,121 @@ def print_variation_change(deleted_sequence,inserted_sequence): # print the cons [paths,seg_seq]=get_segments_sequence_and_paths(gfa) segments_on_target_genome=get_segments_positions_on_genome(pos_seg) -cds_var=get_cds_variations(var_file) +mrna_var=get_mrna_var(var_file) for feature in Features.values(): # add the sequence of all features add_feature_sequence(feature,seg_seq) -# analysing the variations for all the cds : -for cds_id in cds_var.keys(): - cds=Features[cds_id] - print("analysis of the variations in the CDS",cds_id,":\n") - frameshift=0 - for index, var in enumerate(cds_var[cds_id]): # for each variation in the current cds : - type_var=var[8] - if type_var!="no_var": # if there is a variation - posVar=[int(var[12]),int(var[13])] - sequence_target=get_sequence_on_genome(cds_id,segments_on_target_genome) - - if type_var=="insertion": - length_ref=0 - else: - length_ref=len(var[9]) - if type_var=="deletion": - length_alt=0 - else: - length_alt=len(var[10]) - - print("variation",index+1, ":") - - if posVar[0]<=3: - print("variation of the start codon, mRNA most likely wont be translated") - #findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon - break - - old_frameshift=frameshift - frameshift=(frameshift+length_ref-length_alt)%3 - if old_frameshift!=frameshift: - print("variation causing a frameshift") - # frameshift=0 -> reading frame recovered. may need to get a base before. - # frameshift=1 -> frameshift of 1 base to the right - # frameshift=2 -> frameshift of 2 bases to the right - if frameshift==0: - print("recovery of the original reading frame") - if old_frameshift==0: - print("loss of the original reading frame") - #else: - # print("variation not causing a frameshift") - - - if type_var=="insertion": - print("insertion of",var[10]) - elif type_var=="deletion": - print("deletion of",var[9]) - else: - print("substitution of",var[9],"by",var[10]) +analysis=True +if analysis==True: + # analysing the variations for all the mrna : + for mrna_id in mrna_var.keys(): + print("analysis of the variations in the mRNA",mrna_id,":\n") + leftover_ref="" + leftover_target="" + frameshift=0 + cds_var=mrna_var[mrna_id] + for index_cds,cds_id in enumerate(cds_var): + + cds=Features[cds_id] + print("\nCDS",cds_id,":") + for index_var, var in enumerate(cds_var[cds_id]): # for each variation in the current cds : + type_var=var[8] + if type_var!="no_var": # if there is a variation + posVar=[int(var[12]),int(var[13])] + sequence_target=get_sequence_on_genome(cds_id,segments_on_target_genome) + + if type_var=="insertion": + length_ref=0 + else: + length_ref=len(var[9]) + if type_var=="deletion": + length_alt=0 + else: + length_alt=len(var[10]) + + print("\nvariation",index_var+1, ":") + + if (index_cds==0) & (posVar[0]<3): + print("variation of the start codon, mRNA most likely wont be translated") + #findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon + break - len_fragment_before_del=(posVar[0])%3 - len_fragment_before_ins=(posVar[1])%3 + old_frameshift=frameshift + frameshift=(frameshift+length_ref-length_alt)%3 + if old_frameshift!=frameshift: + print("variation causing a frameshift") + # frameshift=0 -> reading frame recovered. may need to get a base before. + # frameshift=1 -> frameshift of 1 base to the right + # frameshift=2 -> frameshift of 2 bases to the right + if frameshift==0: + print("recovery of the original reading frame") + if old_frameshift==0: + print("loss of the original reading frame") + #else: + # print("variation not causing a frameshift") + + + if type_var=="insertion": + print("insertion of",var[10]) + elif type_var=="deletion": + print("deletion of",var[9]) + else: + print("substitution of",var[9],"by",var[10]) - if frameshift==0: - # print only the local change. - len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3 - len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3 - total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins] - total_del=cds.sequence[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del] - stop=print_variation_change(total_del,total_ins) - if stop: - break + len_fragment_before_del=(posVar[0])%3 + len_fragment_before_ins=(posVar[1])%3 - else: - # print changes from local var to next var. at the next var, we will see if the reading frame is recovered. - print("frameshift of",frameshift,"base(s) to the right.") - if index==len(cds_var[cds_id])-1: # it is the last variation. translate until the end of the cds. - total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:] - total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:] - stop=print_variation_change(total_total_del,total_total_ins) - if stop: - break - else: # not the last variation. translate until the next var. - nextVar=cds_var[cds_id][index+1] - posNextVar=[int(nextVar[12]),int(nextVar[13])] + if frameshift==0: + # print only the local change. + len_fragment_after_del=(3-(len_fragment_before_del+length_ref))%3 + len_fragment_after_ins=(3-(len_fragment_before_ins+length_alt))%3 + total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posVar[1]+length_alt+len_fragment_after_ins] + total_del=cds.sequence[posVar[0]-len_fragment_before_del:posVar[0]+length_ref+len_fragment_after_del] + stop=print_variation_change(total_del,total_ins) + if stop: + break - if nextVar[8]=="insertion": - length_ref_nextvar=0 - else: - length_ref_nextvar:len(nextVar[9]) - if nextVar[8]=="deletion": - length_alt_nextvar=0 else: - length_alt_nextvar=len(nextVar[10]) - - len_fragment_before_del_nextvar=(posNextVar[0])%3 - len_fragment_before_ins_nextvar=(posNextVar[1])%3 - total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar] - total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar] - stop=print_variation_change(total_total_del,total_total_ins) - if stop: - break - # possible that it prints too many variations : for ex if we have a snp on the first and the last base of a codon, - # while printing the effect of the first snp we aso use the second one. + # print changes from local var to next var. at the next var, we will see if the reading frame is recovered. + print("frameshift of",frameshift,"base(s) to the right.") + if index_var==len(cds_var[cds_id])-1: # it is the last variation. translate until the end of the cds while having full codons + total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:] + print(cds.sequence,sequence_target) + total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:] + # supprimer taille%3 des sequences ins et del. + # en fait non. il faut récupérer les séquences sur les seq des cds (sans l'intron dcp), prendre la taille sur ça et supprimer sur ça + del_leftover_len=len(total_total_del)%3 + ins_leftover_len=len(total_total_ins)%3 + if del_leftover_len!=0: + total_total_del=total_total_del[0:-del_leftover_len] + if ins_leftover_len!=0: + total_total_ins=total_total_ins[0:-ins_leftover_len] + stop=print_variation_change(total_total_del,total_total_ins) + if stop: + break + else: # not the last variation. translate until the next var. + nextVar=cds_var[cds_id][index_var+1] + posNextVar=[int(nextVar[12]),int(nextVar[13])] + + if nextVar[8]=="insertion": + length_ref_nextvar=0 + else: + length_ref_nextvar:len(nextVar[9]) + if nextVar[8]=="deletion": + length_alt_nextvar=0 + else: + length_alt_nextvar=len(nextVar[10]) + + len_fragment_before_del_nextvar=(posNextVar[0])%3 + len_fragment_before_ins_nextvar=(posNextVar[1])%3 + total_total_del=cds.sequence[posVar[0]-len_fragment_before_del:posNextVar[0]-len_fragment_before_del_nextvar] + total_total_ins=sequence_target[posVar[1]-len_fragment_before_ins:posNextVar[1]-len_fragment_before_ins_nextvar] + stop=print_variation_change(total_total_del,total_total_ins) + if stop: + break + + if index_var==len(cds_var[cds_id])-1: # it is the last variation, get the end of the cds + leftover_ref=cds.sequence[len(cds.sequence)-3:] + leftover_target=sequence_target[len(sequence_target)-3:] - print("\n") -