Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
G
GrAnnoT
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
DIADE
dynadiv
GrAnnoT
Commits
2e5f4d81
Commit
2e5f4d81
authored
1 year ago
by
nina.marthe_ird.fr
Browse files
Options
Downloads
Plain Diff
Merge branch 'inferences'
parents
1b5825d6
1ad1a7d9
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
getSegmentsCoordinates.py
+1
-1
1 addition, 1 deletion
getSegmentsCoordinates.py
inference.py
+126
-102
126 additions, 102 deletions
inference.py
with
127 additions
and
103 deletions
getSegmentsCoordinates.py
+
1
−
1
View file @
2e5f4d81
...
...
@@ -84,7 +84,7 @@ for line in lines :
seg_start
=
position
seg_name
=
'
s
'
+
path
[
i
][
1
:]
seg_stop
=
position
+
segments_size
[
seg_name
]
-
1
seg_stop
=
position
+
segments_size
[
seg_name
]
if
path
[
i
][
0
:
1
]
==
'
>
'
:
orientation
=
'
+
'
...
...
This diff is collapsed.
Click to expand it.
inference.py
+
126
−
102
View file @
2e5f4d81
...
...
@@ -53,10 +53,12 @@ def get_segments_positions_on_genome(pos_seg): # creates a dict with the positio
def
add_feature_sequence
(
feature
,
seg_seq
):
# add the feature's sequence in the Feature object
feature_sequence
=
""
for
segment
in
feature
.
segments_list
:
if
(
segment
==
feature
.
segments_list
[
0
])
&
(
segment
==
feature
.
segments_list
[
-
1
]):
# the segment is the first AND the last of the feature
feature_sequence
=
get_segment_sequence
(
seg_seq
,
segment
)[
feature
.
pos_start
-
2
:
feature
.
pos_stop
-
1
]
if
segment
==
feature
.
segments_list
[
0
]:
feature_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)[
feature
.
pos_start
-
2
:]
# revérifier les +/- 1 pour la position, avec de vraies données
elif
segment
==
feature
.
segments_list
[
-
1
]:
feature_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)[
0
:
feature
.
pos_stop
]
# revérifier les +/- 1 pour la position, avec de vraies données
feature_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)[
0
:
feature
.
pos_stop
-
1
]
# revérifier les +/- 1 pour la position, avec de vraies données
else
:
feature_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)
feature
.
sequence
=
feature_sequence
...
...
@@ -171,26 +173,31 @@ def get_sequence_on_genome(feature,segments_on_target_genome): # returns the seq
if
segment
==
cds
.
segments_list
[
0
]:
new_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)[
cds
.
pos_start
-
2
:]
elif
segment
==
cds
.
segments_list
[
-
1
]:
new_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)[
0
:
cds
.
pos_stop
]
new_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)[
0
:
cds
.
pos_stop
-
1
]
else
:
new_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)
return
new_sequence
def
get_
cds_variations
(
var_file
):
# creates dict : cds_id -> list of variations from the var file
def
get_
mrna_var
(
var_file
):
var
=
open
(
var_file
,
'
r
'
)
lines
=
var
.
readlines
()
var
.
close
()
# dict cds-var
cds_var
=
{}
mrna_var
=
{}
for
line
in
lines
:
line
=
line
.
split
()
if
line
[
1
]
==
"
CDS
"
:
feature_type
=
line
[
1
]
if
feature_type
==
"
mRNA
"
:
mrna_id
=
line
[
0
].
replace
(
'
.
'
,
'
_
'
).
replace
(
'
:
'
,
'
_
'
)
if
mrna_id
not
in
mrna_var
.
keys
():
mrna_var
[
mrna_id
]
=
{}
elif
feature_type
==
"
CDS
"
:
cds_id
=
line
[
0
].
replace
(
'
.
'
,
'
_
'
).
replace
(
'
:
'
,
'
_
'
)
if
cds_id
not
in
cds_var
.
keys
():
cds_var
[
cds_id
]
=
list
()
cds_var
[
cds_id
].
append
(
line
)
return
cds_var
parent_id
=
line
[
0
][
0
:
line
[
0
].
find
(
"
cds
"
)
-
1
]
if
cds_id
not
in
mrna_var
[
parent_id
]:
mrna_var
[
parent_id
][
cds_id
]
=
list
()
mrna_var
[
parent_id
][
cds_id
].
append
(
line
)
return
mrna_var
import
re
def
findOtherStart
(
cds
,
segments_on_target_genome
):
# look for another start codon, before or after the original one
...
...
@@ -247,104 +254,121 @@ def print_variation_change(deleted_sequence,inserted_sequence): # print the cons
[
paths
,
seg_seq
]
=
get_segments_sequence_and_paths
(
gfa
)
segments_on_target_genome
=
get_segments_positions_on_genome
(
pos_seg
)
cds
_var
=
get_
cds_variations
(
var_file
)
mrna
_var
=
get_
mrna_var
(
var_file
)
for
feature
in
Features
.
values
():
# add the sequence of all features
add_feature_sequence
(
feature
,
seg_seq
)
# analysing the variations for all the cds :
for
cds_id
in
cds_var
.
keys
():
cds
=
Features
[
cds_id
]
print
(
"
analysis of the variations in the CDS
"
,
cds_id
,
"
:
\n
"
)
frameshift
=
0
for
index
,
var
in
enumerate
(
cds_var
[
cds_id
]):
# for each variation in the current cds :
type_var
=
var
[
8
]
if
type_var
!=
"
no_var
"
:
# if there is a variation
posVar
=
[
int
(
var
[
12
]),
int
(
var
[
13
])]
sequence_target
=
get_sequence_on_genome
(
cds_id
,
segments_on_target_genome
)
if
type_var
==
"
insertion
"
:
length_ref
=
0
else
:
length_ref
=
len
(
var
[
9
])
if
type_var
==
"
deletion
"
:
length_alt
=
0
else
:
length_alt
=
len
(
var
[
10
])
print
(
"
variation
"
,
index
+
1
,
"
:
"
)
if
posVar
[
0
]
<=
3
:
print
(
"
variation of the start codon, mRNA most likely wont be translated
"
)
#findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon
break
old_frameshift
=
frameshift
frameshift
=
(
frameshift
+
length_ref
-
length_alt
)
%
3
if
old_frameshift
!=
frameshift
:
print
(
"
variation causing a frameshift
"
)
# frameshift=0 -> reading frame recovered. may need to get a base before.
# frameshift=1 -> frameshift of 1 base to the right
# frameshift=2 -> frameshift of 2 bases to the right
if
frameshift
==
0
:
print
(
"
recovery of the original reading frame
"
)
if
old_frameshift
==
0
:
print
(
"
loss of the original reading frame
"
)
#else:
# print("variation not causing a frameshift")
if
type_var
==
"
insertion
"
:
print
(
"
insertion of
"
,
var
[
10
])
elif
type_var
==
"
deletion
"
:
print
(
"
deletion of
"
,
var
[
9
])
else
:
print
(
"
substitution of
"
,
var
[
9
],
"
by
"
,
var
[
10
])
analysis
=
True
if
analysis
==
True
:
# analysing the variations for all the mrna :
for
mrna_id
in
mrna_var
.
keys
():
print
(
"
analysis of the variations in the mRNA
"
,
mrna_id
,
"
:
\n
"
)
leftover_ref
=
""
leftover_target
=
""
frameshift
=
0
cds_var
=
mrna_var
[
mrna_id
]
for
index_cds
,
cds_id
in
enumerate
(
cds_var
):
cds
=
Features
[
cds_id
]
print
(
"
\n
CDS
"
,
cds_id
,
"
:
"
)
for
index_var
,
var
in
enumerate
(
cds_var
[
cds_id
]):
# for each variation in the current cds :
type_var
=
var
[
8
]
if
type_var
!=
"
no_var
"
:
# if there is a variation
posVar
=
[
int
(
var
[
12
]),
int
(
var
[
13
])]
sequence_target
=
get_sequence_on_genome
(
cds_id
,
segments_on_target_genome
)
if
type_var
==
"
insertion
"
:
length_ref
=
0
else
:
length_ref
=
len
(
var
[
9
])
if
type_var
==
"
deletion
"
:
length_alt
=
0
else
:
length_alt
=
len
(
var
[
10
])
print
(
"
\n
variation
"
,
index_var
+
1
,
"
:
"
)
if
(
index_cds
==
0
)
&
(
posVar
[
0
]
<
3
):
print
(
"
variation of the start codon, mRNA most likely wont be translated
"
)
#findOtherStart(cds,segments_on_target_genome) # for now we don't look for another start codon
break
len_fragment_before_del
=
(
posVar
[
0
])
%
3
len_fragment_before_ins
=
(
posVar
[
1
])
%
3
old_frameshift
=
frameshift
frameshift
=
(
frameshift
+
length_ref
-
length_alt
)
%
3
if
old_frameshift
!=
frameshift
:
print
(
"
variation causing a frameshift
"
)
# frameshift=0 -> reading frame recovered. may need to get a base before.
# frameshift=1 -> frameshift of 1 base to the right
# frameshift=2 -> frameshift of 2 bases to the right
if
frameshift
==
0
:
print
(
"
recovery of the original reading frame
"
)
if
old_frameshift
==
0
:
print
(
"
loss of the original reading frame
"
)
#else:
# print("variation not causing a frameshift")
if
type_var
==
"
insertion
"
:
print
(
"
insertion of
"
,
var
[
10
])
elif
type_var
==
"
deletion
"
:
print
(
"
deletion of
"
,
var
[
9
])
else
:
print
(
"
substitution of
"
,
var
[
9
],
"
by
"
,
var
[
10
])
if
frameshift
==
0
:
# print only the local change.
len_fragment_after_del
=
(
3
-
(
len_fragment_before_del
+
length_ref
))
%
3
len_fragment_after_ins
=
(
3
-
(
len_fragment_before_ins
+
length_alt
))
%
3
total_ins
=
sequence_target
[
posVar
[
1
]
-
len_fragment_before_ins
:
posVar
[
1
]
+
length_alt
+
len_fragment_after_ins
]
total_del
=
cds
.
sequence
[
posVar
[
0
]
-
len_fragment_before_del
:
posVar
[
0
]
+
length_ref
+
len_fragment_after_del
]
stop
=
print_variation_change
(
total_del
,
total_ins
)
if
stop
:
break
len_fragment_before_del
=
(
posVar
[
0
])
%
3
len_fragment_before_ins
=
(
posVar
[
1
])
%
3
else
:
# print changes from local var to next var. at the next var, we will see if the reading frame is recovered.
print
(
"
frameshift of
"
,
frameshift
,
"
base(s) to the right.
"
)
if
index
==
len
(
cds_var
[
cds_id
])
-
1
:
# it is the last variation. translate until the end of the cds.
total_total_del
=
cds
.
sequence
[
posVar
[
0
]
-
len_fragment_before_del
:]
total_total_ins
=
sequence_target
[
posVar
[
1
]
-
len_fragment_before_ins
:]
stop
=
print_variation_change
(
total_total_del
,
total_total_ins
)
if
stop
:
break
else
:
# not the last variation. translate until the next var.
nextVar
=
cds_var
[
cds_id
][
index
+
1
]
posNextVar
=
[
int
(
nextVar
[
12
]),
int
(
nextVar
[
13
])]
if
frameshift
==
0
:
# print only the local change.
len_fragment_after_del
=
(
3
-
(
len_fragment_before_del
+
length_ref
))
%
3
len_fragment_after_ins
=
(
3
-
(
len_fragment_before_ins
+
length_alt
))
%
3
total_ins
=
sequence_target
[
posVar
[
1
]
-
len_fragment_before_ins
:
posVar
[
1
]
+
length_alt
+
len_fragment_after_ins
]
total_del
=
cds
.
sequence
[
posVar
[
0
]
-
len_fragment_before_del
:
posVar
[
0
]
+
length_ref
+
len_fragment_after_del
]
stop
=
print_variation_change
(
total_del
,
total_ins
)
if
stop
:
break
if
nextVar
[
8
]
==
"
insertion
"
:
length_ref_nextvar
=
0
else
:
length_ref_nextvar
:
len
(
nextVar
[
9
])
if
nextVar
[
8
]
==
"
deletion
"
:
length_alt_nextvar
=
0
else
:
length_alt_nextvar
=
len
(
nextVar
[
10
])
len_fragment_before_del_nextvar
=
(
posNextVar
[
0
])
%
3
len_fragment_before_ins_nextvar
=
(
posNextVar
[
1
])
%
3
total_total_del
=
cds
.
sequence
[
posVar
[
0
]
-
len_fragment_before_del
:
posNextVar
[
0
]
-
len_fragment_before_del_nextvar
]
total_total_ins
=
sequence_target
[
posVar
[
1
]
-
len_fragment_before_ins
:
posNextVar
[
1
]
-
len_fragment_before_ins_nextvar
]
stop
=
print_variation_change
(
total_total_del
,
total_total_ins
)
if
stop
:
break
# possible that it prints too many variations : for ex if we have a snp on the first and the last base of a codon,
# while printing the effect of the first snp we aso use the second one.
# print changes from local var to next var. at the next var, we will see if the reading frame is recovered.
print
(
"
frameshift of
"
,
frameshift
,
"
base(s) to the right.
"
)
if
index_var
==
len
(
cds_var
[
cds_id
])
-
1
:
# it is the last variation. translate until the end of the cds while having full codons
total_total_del
=
cds
.
sequence
[
posVar
[
0
]
-
len_fragment_before_del
:]
print
(
cds
.
sequence
,
sequence_target
)
total_total_ins
=
sequence_target
[
posVar
[
1
]
-
len_fragment_before_ins
:]
# supprimer taille%3 des sequences ins et del.
# en fait non. il faut récupérer les séquences sur les seq des cds (sans l'intron dcp), prendre la taille sur ça et supprimer sur ça
del_leftover_len
=
len
(
total_total_del
)
%
3
ins_leftover_len
=
len
(
total_total_ins
)
%
3
if
del_leftover_len
!=
0
:
total_total_del
=
total_total_del
[
0
:
-
del_leftover_len
]
if
ins_leftover_len
!=
0
:
total_total_ins
=
total_total_ins
[
0
:
-
ins_leftover_len
]
stop
=
print_variation_change
(
total_total_del
,
total_total_ins
)
if
stop
:
break
else
:
# not the last variation. translate until the next var.
nextVar
=
cds_var
[
cds_id
][
index_var
+
1
]
posNextVar
=
[
int
(
nextVar
[
12
]),
int
(
nextVar
[
13
])]
if
nextVar
[
8
]
==
"
insertion
"
:
length_ref_nextvar
=
0
else
:
length_ref_nextvar
:
len
(
nextVar
[
9
])
if
nextVar
[
8
]
==
"
deletion
"
:
length_alt_nextvar
=
0
else
:
length_alt_nextvar
=
len
(
nextVar
[
10
])
len_fragment_before_del_nextvar
=
(
posNextVar
[
0
])
%
3
len_fragment_before_ins_nextvar
=
(
posNextVar
[
1
])
%
3
total_total_del
=
cds
.
sequence
[
posVar
[
0
]
-
len_fragment_before_del
:
posNextVar
[
0
]
-
len_fragment_before_del_nextvar
]
total_total_ins
=
sequence_target
[
posVar
[
1
]
-
len_fragment_before_ins
:
posNextVar
[
1
]
-
len_fragment_before_ins_nextvar
]
stop
=
print_variation_change
(
total_total_del
,
total_total_ins
)
if
stop
:
break
if
index_var
==
len
(
cds_var
[
cds_id
])
-
1
:
# it is the last variation, get the end of the cds
leftover_ref
=
cds
.
sequence
[
len
(
cds
.
sequence
)
-
3
:]
leftover_target
=
sequence_target
[
len
(
sequence_target
)
-
3
:]
print
(
"
\n
"
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment