Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
G
GrAnnoT
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
DIADE
dynadiv
GrAnnoT
Commits
e243b920
Commit
e243b920
authored
1 year ago
by
nina.marthe_ird.fr
Browse files
Options
Downloads
Patches
Plain Diff
simplified the code for inference, made a function for it, attempted to handle multi exonic genes
parent
3b61d864
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
inference.py
+83
-105
83 additions, 105 deletions
inference.py
with
83 additions
and
105 deletions
inference.py
+
83
−
105
View file @
e243b920
...
...
@@ -236,6 +236,7 @@ def findOtherStart(cds,segments_on_target_genome): # look for another start codo
return
frame_shift
def
print_variation_change
(
deleted_sequence
,
inserted_sequence
):
# print the consequence of the variation represented by the insertion and deletion
print
(
"
printvar
"
,
deleted_sequence
,
inserted_sequence
)
stop
=
False
deleted_aa
=
traduction
(
get_rna
(
deleted_sequence
))
inserted_aa
=
traduction
(
get_rna
(
inserted_sequence
))
...
...
@@ -262,6 +263,66 @@ for feature in Features.values(): # add the sequence of all features
add_feature_sequence
(
feature
,
seg_seq
)
def
test
(
frameshift
,
readingframe
,
posVar
,
length_ref
,
length_alt
,
sequenceCDS_source
,
sequenceCDS_target
,
index_var
,
cds_var
,
cds_id
):
ProtChangeStartPosOnCDS_ref
=
posVar
[
0
]
-
(
posVar
[
0
]
%
3
)
ProtChangeStartPosOnCDS_alt
=
posVar
[
1
]
-
(
posVar
[
1
]
%
3
)
if
frameshift
==
0
:
# only local change
ProtChangeStopPosOnCDS_ref
=
posVar
[
0
]
+
length_ref
+
(
3
-
(
posVar
[
0
]
+
length_ref
))
%
3
# end of var + rest of the codon
ProtChangeStopPosOnCDS_alt
=
posVar
[
1
]
+
length_alt
+
(
3
-
(
posVar
[
1
]
+
length_alt
))
%
3
elif
index_var
!=
len
(
cds_var
[
cds_id
])
-
1
:
# not the last variation, go until the last variation
nextVar
=
cds_var
[
cds_id
][
index_var
+
1
]
posNextVar
=
[
int
(
nextVar
[
12
])
+
(
3
-
readingframe
)
%
3
,
int
(
nextVar
[
13
])
+
(
3
-
readingframe
)
%
3
]
ProtChangeStopPosOnCDS_ref
=
posNextVar
[
0
]
-
(
posNextVar
[
0
]
%
3
)
ProtChangeStopPosOnCDS_alt
=
posNextVar
[
1
]
-
(
posNextVar
[
1
]
%
3
)
else
:
# last variation, go until the end of the cds
ProtChangeStopPosOnCDS_ref
=
len
(
sequenceCDS_source
)
ProtChangeStopPosOnCDS_alt
=
len
(
sequenceCDS_target
)
ref_sequence
=
sequenceCDS_source
[
ProtChangeStartPosOnCDS_ref
:
ProtChangeStopPosOnCDS_ref
]
alt_sequence
=
sequenceCDS_target
[
ProtChangeStartPosOnCDS_alt
:
ProtChangeStopPosOnCDS_alt
]
if
(
frameshift
!=
0
)
&
(
index_var
==
len
(
cds_var
[
cds_id
])
-
1
)
&
(
index_cds
!=
len
(
cds_var
)
-
1
):
# if it is the last variation of the cds and it is not the last cds
# get the sequence until the next variation in the next cds
# for this, go through all the cds left, and add the sequence until the first variation encountered
var_found
=
False
keys
=
list
(
cds_var
.
keys
())
cds_rank
=
keys
.
index
(
cds_id
)
+
1
nextCds
=
keys
[
cds_rank
]
current_readingframe
=
readingframe
# # look for the next variation, and get the sequence before.
while
(
not
var_found
)
&
(
keys
.
index
(
nextCds
)
!=
len
(
keys
)):
# go through all the cds until the last
if
len
(
cds_var
[
nextCds
])
==
0
:
# if the current cds has no variation, simply add its sequence
ref_sequence
+=
Features
[
nextCds
].
sequence
alt_sequence
+=
get_sequence_on_genome
(
nextCds
,
segments_on_target_genome
)
cds_rank
+=
1
nextCds
=
keys
[
cds_rank
]
#current_readingframe=(3-(len(sequenceCDS_source)-current_readingframe))%3
print
(
"
no var in current cds
"
)
else
:
# variation found
variation
=
cds_var
[
nextCds
][
0
]
#posNextVar=[int(variation[12])+(3-current_readingframe)%3,int(variation[13])+(3-current_readingframe)%3]
# revoir le calcul des fragments avant la variation !
# cas où la var est sur la deuxième base du cds : alors il faut peut être laisser à cette var les bases avant du cds précédent ......
# et je crois qu'on a toujours pas réglé le cas où on a une sbustitution sur la base 1 et 3 d'un codon.
nextReadingframe
=
(
3
-
(
len
(
sequenceCDS_source
)
-
readingframe
))
%
3
len_codonFragment_beforeNextVar_ref
=
(
int
(
variation
[
12
])
-
nextReadingframe
)
%
3
len_codonFragment_beforeNextVar_alt
=
(
int
(
variation
[
13
])
+
(
3
-
frameshift
)
-
nextReadingframe
)
%
3
ref_sequence
+=
Features
[
nextCds
].
sequence
[
0
:
int
(
variation
[
12
])
-
len_codonFragment_beforeNextVar_ref
]
alt_sequence
+=
get_sequence_on_genome
(
nextCds
,
segments_on_target_genome
)[
0
:
int
(
variation
[
13
])
-
len_codonFragment_beforeNextVar_alt
]
#print("pos_var",variation[12],variation[13])
#print("next readingframe, frameshift",nextReadingframe,frameshift)
#print("len frag before",len_codonFragment_beforeNextVar_ref,len_codonFragment_beforeNextVar_alt)
#print("sequences",Features[nextCds].sequence,get_sequence_on_genome(nextCds,segments_on_target_genome))
var_found
=
True
return
print_variation_change
(
ref_sequence
,
alt_sequence
)
analysis
=
True
if
analysis
==
True
:
# analysing the variations for all the mrna :
...
...
@@ -270,28 +331,31 @@ if analysis==True:
leftover_ref
=
""
leftover_target
=
""
frameshift
=
0
readingframe
=
3
readingframe
=
0
cds_var
=
mrna_var
[
mrna_id
]
for
index_cds
,
cds_id
in
enumerate
(
cds_var
):
cds
=
Features
[
cds_id
]
print
(
"
\n
CDS
"
,
cds_id
,
"
:
"
)
print
(
"
readingframe
"
,
readingframe
,
"
frameshift
"
,
frameshift
)
for
index_var
,
var
in
enumerate
(
cds_var
[
cds_id
]):
# for each variation in the current cds :
type_var
=
var
[
8
]
if
type_var
!=
"
no_var
"
:
# if there is a variation
posVar
=
[
int
(
var
[
12
])
+
3
-
readingframe
,
int
(
var
[
13
])
+
3
-
readingframe
]
sequence_target
=
leftover_target
[
readingframe
:]
+
get_sequence_on_genome
(
cds_id
,
segments_on_target_genome
)
sequence_ref
=
leftover_ref
[
readingframe
:]
+
cds
.
sequence
print
(
"
\n
variation
"
,
index_var
+
1
,
"
:
"
)
length_ref
=
len
(
var
[
9
])
length_alt
=
len
(
var
[
10
])
if
type_var
==
"
insertion
"
:
length_ref
=
0
else
:
length_ref
=
len
(
var
[
9
])
if
type_var
==
"
deletion
"
:
print
(
"
insertion of
"
,
var
[
10
])
elif
type_var
==
"
deletion
"
:
length_alt
=
0
print
(
"
deletion of
"
,
var
[
9
])
else
:
length_alt
=
len
(
var
[
10
])
print
(
"
\n
variation
"
,
index_var
+
1
,
"
:
"
)
print
(
"
substitution of
"
,
var
[
9
],
"
by
"
,
var
[
10
])
posVar
=
[
int
(
var
[
12
])
+
(
3
-
readingframe
)
%
3
,
int
(
var
[
13
])
+
(
3
-
readingframe
)
%
3
]
sequenceCDS_target
=
leftover_target
[
readingframe
:]
+
get_sequence_on_genome
(
cds_id
,
segments_on_target_genome
)
sequenceCDS_source
=
leftover_ref
[
readingframe
:]
+
Features
[
cds_id
].
sequence
print
(
sequenceCDS_source
,
sequenceCDS_target
,
posVar
,
var
[
12
:
14
])
if
(
index_cds
==
0
)
&
(
posVar
[
0
]
<
3
):
print
(
"
variation of the start codon, mRNA most likely wont be translated
"
)
...
...
@@ -302,107 +366,21 @@ if analysis==True:
frameshift
=
(
frameshift
+
length_ref
-
length_alt
)
%
3
if
old_frameshift
!=
frameshift
:
print
(
"
variation causing a frameshift
"
)
# frameshift=0 -> reading frame recovered
. may need to get a base before.
# frameshift=0 -> reading frame recovered
# frameshift=1 -> frameshift of 1 base to the right
# frameshift=2 -> frameshift of 2 bases to the right
if
frameshift
==
0
:
print
(
"
recovery of the original reading frame
"
)
print
(
"
recovery of the original reading frame
"
)
if
old_frameshift
==
0
:
print
(
"
loss of the original reading frame
"
)
#else:
# print("variation not causing a frameshift")
stop
=
test
(
frameshift
,
readingframe
,
posVar
,
length_ref
,
length_alt
,
sequenceCDS_source
,
sequenceCDS_target
,
index_var
,
cds_var
,
cds_id
)
if
type_var
==
"
insertion
"
:
print
(
"
insertion of
"
,
var
[
10
])
elif
type_var
==
"
deletion
"
:
print
(
"
deletion of
"
,
var
[
9
])
else
:
print
(
"
substitution of
"
,
var
[
9
],
"
by
"
,
var
[
10
])
len_fragment_before_del
=
(
posVar
[
0
])
%
3
len_fragment_before_ins
=
(
posVar
[
1
])
%
3
if
frameshift
==
0
:
# print only the local change.
len_fragment_after_del
=
(
3
-
(
len_fragment_before_del
+
length_ref
))
%
3
len_fragment_after_ins
=
(
3
-
(
len_fragment_before_ins
+
length_alt
))
%
3
total_ins
=
sequence_target
[
posVar
[
1
]
-
len_fragment_before_ins
:
posVar
[
1
]
+
length_alt
+
len_fragment_after_ins
]
total_del
=
sequence_ref
[
posVar
[
0
]
-
len_fragment_before_del
:
posVar
[
0
]
+
length_ref
+
len_fragment_after_del
]
stop
=
print_variation_change
(
total_del
,
total_ins
)
if
stop
:
break
if
stop
:
break
else
:
# print changes from local var to next var. at the next var, we will see if the reading frame is recovered.
print
(
"
frameshift of
"
,
frameshift
,
"
base(s) to the right.
"
)
if
index_var
==
len
(
cds_var
[
cds_id
])
-
1
:
# it is the last variation of the cds. translate until the next var in the rest of the mrna
total_total_del
=
sequence_ref
[
posVar
[
0
]
-
len_fragment_before_del
:]
total_total_ins
=
sequence_target
[
posVar
[
1
]
-
len_fragment_before_ins
:]
del_leftover_len
=
len
(
total_total_del
)
%
3
ins_leftover_len
=
len
(
total_total_ins
)
%
3
# if it is not the last cds, add the modifications until the next variation.
# get the leftover from this cds, and check the next one for the rest.
# for this, go through all the cds left, and add the sequence until the first variation encountered.
if
index_cds
!=
len
(
cds_var
)
-
1
:
# if this is not the last cds
# add the sequence until the next variation
var_found
=
False
keys
=
list
(
cds_var
.
keys
())
cds_rank
=
keys
.
index
(
cds_id
)
cds_rank
+=
1
nextCds
=
keys
[
cds_rank
]
while
(
not
var_found
)
&
(
keys
.
index
(
nextCds
)
!=
len
(
keys
)):
# vérifier qu'on dépasse pas
if
len
(
cds_var
[
nextCds
])
==
0
:
# if the current cds has no variation, simply add its sequence
total_total_del
+=
Features
[
nextCds
].
sequence
total_total_ins
+=
get_sequence_on_genome
(
nextCds
,
segments_on_target_genome
)
cds_rank
+=
1
nextCds
=
keys
[
cds_rank
]
print
(
"
no var in current cds
"
)
else
:
# variation found
variation
=
cds_var
[
nextCds
][
0
]
nextReadingframe
=
(
3
-
(
len
(
sequence_ref
)
-
readingframe
))
%
3
len_fragment_before_del_nextvar
=
(
int
(
variation
[
12
])
-
nextReadingframe
)
%
3
len_fragment_before_ins_nextvar
=
(
int
(
variation
[
13
])
+
(
3
-
frameshift
)
-
nextReadingframe
)
%
3
total_total_del
+=
Features
[
nextCds
].
sequence
[
0
:
int
(
variation
[
12
])
-
len_fragment_before_del_nextvar
]
total_total_ins
+=
get_sequence_on_genome
(
nextCds
,
segments_on_target_genome
)[
0
:
int
(
variation
[
13
])
-
len_fragment_before_ins_nextvar
]
#print("pos_var",variation[12],variation[13])
#print("next readingframe, frameshift",nextReadingframe,frameshift)
#print("len frag before",len_fragment_before_del_nextvar,len_fragment_before_ins_nextvar)
#print("sequences",Features[nextCds].sequence,get_sequence_on_genome(nextCds,segments_on_target_genome))
var_found
=
True
stop
=
print_variation_change
(
total_total_del
,
total_total_ins
)
if
stop
:
break
else
:
# not the last variation. translate until the next var.
nextVar
=
cds_var
[
cds_id
][
index_var
+
1
]
posNextVar
=
[
int
(
nextVar
[
12
]),
int
(
nextVar
[
13
])]
if
nextVar
[
8
]
==
"
insertion
"
:
length_ref_nextvar
=
0
else
:
length_ref_nextvar
:
len
(
nextVar
[
9
])
if
nextVar
[
8
]
==
"
deletion
"
:
length_alt_nextvar
=
0
else
:
length_alt_nextvar
=
len
(
nextVar
[
10
])
len_fragment_before_del_nextvar
=
(
posNextVar
[
0
])
%
3
len_fragment_before_ins_nextvar
=
(
posNextVar
[
1
])
%
3
total_total_del
=
sequence_ref
[
posVar
[
0
]
-
len_fragment_before_del
:
posNextVar
[
0
]
-
len_fragment_before_del_nextvar
]
total_total_ins
=
sequence_target
[
posVar
[
1
]
-
len_fragment_before_ins
:
posNextVar
[
1
]
-
len_fragment_before_ins_nextvar
]
stop
=
print_variation_change
(
total_total_del
,
total_total_ins
)
if
stop
:
break
if
index_var
==
len
(
cds_var
[
cds_id
])
-
1
:
# it is the last variation, get the end of the cds
leftover_ref
=
sequence
_ref
[
len
(
sequence
_ref
)
-
3
:]
leftover_target
=
sequence_target
[
len
(
sequence_target
)
-
3
:]
readingframe
=
(
3
-
(
len
(
sequence
_ref
)
-
readingframe
))
%
3
leftover_ref
=
sequence
CDS_source
[
len
(
sequence
CDS_source
)
-
3
:]
leftover_target
=
sequence
CDS
_target
[
len
(
sequence
CDS
_target
)
-
3
:]
readingframe
=
(
3
-
(
len
(
sequence
CDS_source
)
-
readingframe
))
%
3
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment