Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
G
GrAnnoT
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
DIADE
dynadiv
GrAnnoT
Commits
05f58532
Commit
05f58532
authored
1 year ago
by
nina.marthe_ird.fr
Browse files
Options
Downloads
Patches
Plain Diff
commencé à traité les variations sur le codon start
parent
5c80b92f
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
inference.py
+152
-76
152 additions, 76 deletions
inference.py
with
152 additions
and
76 deletions
inference.py
+
152
−
76
View file @
05f58532
from
Graph_gff
import
Features
,
load_intersect
from
Graph_gff
import
Features
,
load_intersect
from
Functions
import
get_segment_sequence
,
convert_strand
from
Functions
import
get_segment_sequence
,
convert_strand
target_genome_name
=
"
genome
3
_chr10
"
target_genome_name
=
"
genome
4
_chr10
"
intersect_path
=
'
/home/nina/annotpangenome/test_data/input_data_inf/intersect.bed
'
intersect_path
=
'
/home/nina/annotpangenome/test_data/input_data_inf/intersect.bed
'
load_intersect
(
intersect_path
)
load_intersect
(
intersect_path
)
...
@@ -58,7 +58,7 @@ def add_feature_sequence(feature,seg_seq):
...
@@ -58,7 +58,7 @@ def add_feature_sequence(feature,seg_seq):
feature_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)
feature_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)
feature
.
sequence
=
feature_sequence
feature
.
sequence
=
feature_sequence
def
get_first_seg
(
list_seg
):
def
get_first_seg
(
list_seg
,
segments_on_target_genome
):
first_seg_found
=
''
first_seg_found
=
''
for
segment
in
list_seg
:
for
segment
in
list_seg
:
if
segment
[
1
:]
in
segments_on_target_genome
:
if
segment
[
1
:]
in
segments_on_target_genome
:
...
@@ -164,19 +164,7 @@ def decoupe_codon(sequence):
...
@@ -164,19 +164,7 @@ def decoupe_codon(sequence):
var
=
open
(
"
test_data/variations.txt
"
,
'
r
'
)
lines
=
var
.
readlines
()
var
.
close
()
# dict cds-var
cds_var
=
{}
for
line
in
lines
:
line
=
line
.
split
()
if
line
[
1
]
==
"
CDS
"
:
cds_id
=
line
[
0
].
replace
(
'
.
'
,
'
_
'
).
replace
(
'
:
'
,
'
_
'
)
if
cds_id
not
in
cds_var
.
keys
():
cds_var
[
cds_id
]
=
list
()
cds_var
[
cds_id
].
append
(
line
)
def
get_sequence_before
(
first_seg
,
seg_seq
,
n
,
paths
,
feat
):
def
get_sequence_before
(
first_seg
,
seg_seq
,
n
,
paths
,
feat
):
first_strand
=
convert_strand
(
first_seg
[
0
])
first_strand
=
convert_strand
(
first_seg
[
0
])
...
@@ -219,15 +207,11 @@ def get_sequence_after(last_seg,seg_seq,n,paths,feat):
...
@@ -219,15 +207,11 @@ def get_sequence_after(last_seg,seg_seq,n,paths,feat):
list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu]
list_segfeat_azu_corrected=[convert_strand(segment_stranded[0])+segment_stranded[1:] for segment_stranded in list_segfeat_azu]
'''
'''
for
cds_id
in
cds_var
.
keys
():
# for a gene that has cds, concatenate all cds to make a prot. then detail var by cds.
cds
=
Features
[
cds_id
]
print
(
"
analysing variations in cds
"
,
cds_id
)
add_feature_sequence
(
cds
,
seg_seq
)
cds_prot
=
traduction
(
get_rna
(
cds
.
sequence
))
list_seg
=
Features
[
cds_id
].
segments_list
def
get_sequence_on_genome
(
feature
,
segments_on_target_genome
):
first_seg
=
get_first_seg
(
list_seg
)
list_seg
=
Features
[
feature
].
segments_list
last_seg
=
get_first_seg
(
reversed
(
list_seg
))
first_seg
=
get_first_seg
(
list_seg
,
segments_on_target_genome
)
last_seg
=
get_first_seg
(
reversed
(
list_seg
),
segments_on_target_genome
)
path_on_target
=
get_feature_path
(
paths
,
first_seg
,
last_seg
)
path_on_target
=
get_feature_path
(
paths
,
first_seg
,
last_seg
)
new_sequence
=
""
new_sequence
=
""
...
@@ -238,76 +222,168 @@ for cds_id in cds_var.keys(): # for a gene that has cds, concatenate all cds to
...
@@ -238,76 +222,168 @@ for cds_id in cds_var.keys(): # for a gene that has cds, concatenate all cds to
new_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)[
0
:
cds
.
pos_stop
]
new_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)[
0
:
cds
.
pos_stop
]
else
:
else
:
new_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)
new_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)
return
new_sequence
new_prot
=
traduction
(
get_rna
(
new_sequence
))
print
(
"
original prot =
"
,
cds_prot
)
print
(
"
new prot =
"
,
new_prot
)
# print new version with new start and stop codons of deleted. (before and after the gene sequence)
var
=
open
(
"
test_data/variations.txt
"
,
'
r
'
)
lines
=
var
.
readlines
()
var
.
close
()
if
new_prot
[
0
]
!=
"
Met
"
:
# dict cds-var
print
(
"
no Met at the start of the new version of the protein -> start codon loss
"
)
cds_var
=
{}
if
"
Met
"
in
new_prot
:
for
line
in
lines
:
print
(
"
Met found at position
"
,
(
new_prot
.
index
(
"
Met
"
)
+
1
),
"
-> possible later start codon
"
)
# print cette version de la prot
line
=
line
.
split
()
if
line
[
1
]
==
"
CDS
"
:
print
(
"
look for start codon before. if none, print
'
no start codon, likely gene not active
'"
)
cds_id
=
line
[
0
].
replace
(
'
.
'
,
'
_
'
).
replace
(
'
:
'
,
'
_
'
)
# récupérer n pb avant, les traduire, chercher la dernière Met (list[::-1].index("Met")), donner cette version de la prot
if
cds_id
not
in
cds_var
.
keys
():
cds_var
[
cds_id
]
=
list
()
sequence_before
=
get_sequence_before
(
path_on_target
[
0
],
seg_seq
,
100
,
paths
,
cds
)
cds_var
[
cds_id
].
append
(
line
)
print
(
sequence_before
)
first_stop_index
=
new_prot
.
index
(
"
*
"
)
if
"
*
"
in
new_prot
else
"
None
"
for
feature
in
Features
.
values
():
if
"
*
"
not
in
new_prot
:
add_feature_sequence
(
feature
,
seg_seq
)
print
(
"
no stop codon
"
)
# récupérer n pb après, les traduire, chercher le premier *, donner cette version de la prot.
elif
first_stop_index
+
1
!=
len
(
new_prot
):
print
(
"
early stop codon at position
"
,(
first_stop_index
+
1
),
"
instead of
"
,
len
(
new_prot
))
else
:
print
(
"
stop codon found at expected position
"
)
sequence_after
=
get_sequence_after
(
path_on_target
[
-
1
],
seg_seq
,
100
,
paths
,
cds
)
print
(
sequence_after
)
version
=
"
new
"
import
re
for
cds_id
in
cds_var
.
keys
():
cds
=
Features
[
cds_id
]
print
(
"
analysing variations in cds
"
,
cds_id
)
for
var
in
cds_var
[
cds_id
]:
for
var
in
cds_var
[
cds_id
]:
#print("\n",var)
size_var
=
int
(
var
[
11
])
size_var
=
int
(
var
[
11
])
type_var
=
var
[
8
]
type_var
=
var
[
8
]
pos_var
=
int
(
var
[
12
])
-
1
pos_var
=
int
(
var
[
12
])
-
1
if
pos_var
<=
3
:
print
(
"
codon start touché
"
)
#chercher start (dans le mrna) avant et apres
# chercher codon start en amont du cds, dans le mrna
seq_parent
=
get_sequence_on_genome
(
cds
.
parent
,
segments_on_target_genome
)
seq_cds
=
get_sequence_on_genome
(
cds_id
,
segments_on_target_genome
)
sequence_amont
=
seq_parent
[
0
:
seq_parent
.
rfind
(
seq_cds
)]
# get the mrna sequence before the last occurence of the cds sequence
print
(
sequence_amont
)
if
"
ATG
"
in
sequence_amont
:
start_pos_list
=
[
m
.
start
()
for
m
in
re
.
finditer
(
'
(?=ATG)
'
,
sequence_amont
)]
stop_pos_list
=
[
m
.
start
()
for
m
in
re
.
finditer
(
'
(?=TAG|TAA|TGA)
'
,
sequence_amont
)]
print
(
start_pos_list
,
stop_pos_list
)
# positions (overlapping) où on trouve un atg.
# vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !)
for
start_pos
in
start_pos_list
:
print
(
start_pos
)
start_pos_frame
=
start_pos
%
3
test_list
=
[
39
,
46
,
47
]
if
True
not
in
(
(
stop_pos
%
3
==
start_pos_frame
)
&
(
stop_pos
>
start_pos
)
for
stop_pos
in
stop_pos_list
)
:
n
=
len
(
sequence_amont
)
-
start_pos
print
(
"
codon start candidat trouvé dans l
'
arn messager,
"
,
n
,
"
bases en amont du cds
"
)
# calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture !
frame_shift
=
n
%
3
# mais si on a plusieurs start candidats il faut choisir celui qu'on prend !??
# chercher codon start en aval, dans le cds
if
"
ATG
"
in
seq_cds
:
start_pos_list
=
[
m
.
start
()
for
m
in
re
.
finditer
(
'
(?=ATG)
'
,
seq_cds
)]
for
pos
in
start_pos_list
:
print
(
"
codon start candidat trouvé plus loin dans le cds, à la base
"
,
pos
)
if
version
==
"
old
"
:
for
cds_id
in
cds_var
.
keys
():
# for a gene that has cds, concatenate all cds to make a prot. then detail var by cds.
cds
=
Features
[
cds_id
]
print
(
"
analysing variations in cds
"
,
cds_id
)
add_feature_sequence
(
cds
,
seg_seq
)
cds_prot
=
traduction
(
get_rna
(
cds
.
sequence
))
list_seg
=
Features
[
cds_id
].
segments_list
first_seg
=
get_first_seg
(
list_seg
)
last_seg
=
get_first_seg
(
reversed
(
list_seg
))
path_on_target
=
get_feature_path
(
paths
,
first_seg
,
last_seg
)
new_sequence
=
""
for
segment
in
path_on_target
:
if
segment
==
cds
.
segments_list
[
0
]:
new_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)[
cds
.
pos_start
-
1
:]
elif
segment
==
cds
.
segments_list
[
-
1
]:
new_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)[
0
:
cds
.
pos_stop
]
else
:
new_sequence
+=
get_segment_sequence
(
seg_seq
,
segment
)
new_prot
=
traduction
(
get_rna
(
new_sequence
))
print
(
"
original prot =
"
,
cds_prot
)
print
(
"
new prot =
"
,
new_prot
)
# print new version with new start and stop codons of deleted. (before and after the gene sequence)
if
new_prot
[
0
]
!=
"
Met
"
:
print
(
"
no Met at the start of the new version of the protein -> start codon loss
"
)
if
"
Met
"
in
new_prot
:
print
(
"
Met found at position
"
,
(
new_prot
.
index
(
"
Met
"
)
+
1
),
"
-> possible later start codon
"
)
# print cette version de la prot
print
(
"
look for start codon before. if none, print
'
no start codon, likely gene not active
'"
)
# récupérer n pb avant, les traduire, chercher la dernière Met (list[::-1].index("Met")), donner cette version de la prot
sequence_before
=
get_sequence_before
(
path_on_target
[
0
],
seg_seq
,
100
,
paths
,
cds
)
print
(
sequence_before
)
first_stop_index
=
new_prot
.
index
(
"
*
"
)
if
"
*
"
in
new_prot
else
"
None
"
if
"
*
"
not
in
new_prot
:
print
(
"
no stop codon
"
)
# récupérer n pb après, les traduire, chercher le premier *, donner cette version de la prot.
elif
first_stop_index
+
1
!=
len
(
new_prot
):
print
(
"
early stop codon at position
"
,(
first_stop_index
+
1
),
"
instead of
"
,
len
(
new_prot
))
else
:
print
(
"
stop codon found at expected position
"
)
sequence_after
=
get_sequence_after
(
path_on_target
[
-
1
],
seg_seq
,
100
,
paths
,
cds
)
print
(
sequence_after
)
for
var
in
cds_var
[
cds_id
]:
#print("\n",var)
size_var
=
int
(
var
[
11
])
type_var
=
var
[
8
]
pos_var
=
int
(
var
[
12
])
-
1
if
type_var
==
"
insertions
"
:
if
type_var
==
"
insertions
"
:
sequence_var
=
var
[
10
]
sequence_var
=
var
[
10
]
if
size_var
%
3
==
0
:
if
size_var
%
3
==
0
:
print
(
"
pas de décalage du cadre de lecture
"
)
print
(
"
pas de décalage du cadre de lecture
"
)
trad_seq_ins
=
traduction
(
get_rna
(
sequence_var
))
trad_seq_ins
=
traduction
(
get_rna
(
sequence_var
))
if
pos_var
%
3
==
0
:
if
pos_var
%
3
==
0
:
print
(
"
insertion entre deux codons
"
)
print
(
"
insertion entre deux codons
"
)
if
"
*
"
in
trad_seq_ins
:
if
"
*
"
in
trad_seq_ins
:
print
(
"
apparition d
'
un codon stop
"
)
print
(
"
apparition d
'
un codon stop
"
)
print
(
f
'
ancienne sequence :
{
"
,
"
.
join
(
cds_prot
)
}
'
)
print
(
f
'
ancienne sequence :
{
"
,
"
.
join
(
cds_prot
)
}
'
)
print
(
f
'
nouvelle sequence :
{
"
,
"
.
join
(
cds_prot
[
0
:
(
pos_var
//
3
)
-
1
])
}
, *
'
)
print
(
f
'
nouvelle sequence :
{
"
,
"
.
join
(
cds_prot
[
0
:
(
pos_var
//
3
)
-
1
])
}
, *
'
)
else
:
print
(
f
'
ancienne sequence :
{
"
,
"
.
join
(
cds_prot
)
}
'
)
print
(
f
'
{
type_var
}
de
{
size_var
//
3
}
acides amines
{
"
,
"
.
join
(
trad_seq_ins
)
}
à la position
{
pos_var
//
3
}
'
)
else
:
else
:
print
(
f
'
ancienne sequence :
{
"
,
"
.
join
(
cds_prot
)
}
'
)
print
(
"
insertion au milieu d
'
un codon, changement de certains acides amines
"
)
print
(
f
'
{
type_var
}
de
{
size_var
//
3
}
acides amines
{
"
,
"
.
join
(
trad_seq_ins
)
}
à la position
{
pos_var
//
3
}
'
)
else
:
elif
type_var
==
"
deletions
"
:
print
(
"
insertion au milieu d
'
un codon, changement de certains acides amines
"
)
sequence_var
=
line
[
9
]
if
size_var
%
3
==
0
:
elif
type_var
==
"
dele
tions
"
:
print
(
"
pas de décalage du cadre
de
le
cture
"
)
sequence_var
=
line
[
9
]
trad_seq_ins
=
traduction
(
get_rna
(
sequence_var
))
if
size_var
%
3
==
0
:
print
(
"
pas de décalage du cadre de lecture
"
)
if
pos_var
%
3
==
0
:
trad_seq_ins
=
traduction
(
get_rna
(
sequence_var
)
)
print
(
"
deletion de codons entiers
"
)
if
"
*
"
in
trad_seq_ins
:
if
pos_var
%
3
==
0
:
print
(
"
disparition d
'
un codon stop
"
)
print
(
"
deletion de codons entiers
"
)
else
:
if
"
*
"
in
trad_seq_ins
:
print
(
f
'
ancienne sequence :
{
"
,
"
.
join
(
cds_prot
)
}
'
)
print
(
"
disparition d
'
un codon stop
"
)
print
(
f
'
{
type_var
}
de
{
size_var
//
3
}
acides amines
{
"
,
"
.
join
(
trad_seq_ins
)
}
à la position
{
pos_var
//
3
}
'
)
else
:
else
:
print
(
f
'
ancienne sequence :
{
"
,
"
.
join
(
cds_prot
)
}
'
)
print
(
"
deletion au milieu d
'
un codon, changement de certains acides amines
"
)
print
(
f
'
{
type_var
}
de
{
size_var
//
3
}
acides amines
{
"
,
"
.
join
(
trad_seq_ins
)
}
à la position
{
pos_var
//
3
}
'
)
else
:
print
(
"
deletion au milieu d
'
un codon, changement de certains acides amines
"
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment