Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
G
GrAnnoT
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
DIADE
dynadiv
GrAnnoT
Commits
b8937efc
Commit
b8937efc
authored
1 year ago
by
nina.marthe_ird.fr
Browse files
Options
Downloads
Patches
Plain Diff
continué le traitement des variations
parent
1a9d6188
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
inference.py
+112
-39
112 additions, 39 deletions
inference.py
with
112 additions
and
39 deletions
inference.py
+
112
−
39
View file @
b8937efc
...
@@ -3,10 +3,14 @@ from Functions import get_segment_sequence,convert_strand
...
@@ -3,10 +3,14 @@ from Functions import get_segment_sequence,convert_strand
target_genome_name
=
"
genome4_chr10
"
target_genome_name
=
"
genome4_chr10
"
intersect_path
=
'
/home/nina/annotpangenome/test_data/input_data_inf/
intersect.bed
'
intersect_path
=
'
intersect.bed
'
load_intersect
(
intersect_path
)
load_intersect
(
intersect_path
)
gfa
=
"
test_data/input_data_inf/graph_test.gfa
"
pos_seg
=
"
genome4_chr10.bed
"
var
=
open
(
"
genome4_chr10_variations.txt
"
,
'
r
'
)
gfa
=
"
graph.gfa
"
def
get_segments_sequence_and_paths
(
gfa
):
def
get_segments_sequence_and_paths
(
gfa
):
file_gfa
=
open
(
gfa
,
'
r
'
)
file_gfa
=
open
(
gfa
,
'
r
'
)
lines_gfa
=
file_gfa
.
readlines
()
lines_gfa
=
file_gfa
.
readlines
()
...
@@ -36,7 +40,7 @@ def get_segments_sequence_and_paths(gfa):
...
@@ -36,7 +40,7 @@ def get_segments_sequence_and_paths(gfa):
[
paths
,
seg_seq
]
=
get_segments_sequence_and_paths
(
gfa
)
[
paths
,
seg_seq
]
=
get_segments_sequence_and_paths
(
gfa
)
segments_on_target_genome
=
{}
segments_on_target_genome
=
{}
pos_seg
=
"
test_data/input_data_inf/genome3_chr10.bed
"
def
get_segments_positions_on_genome
(
pos_seg
):
def
get_segments_positions_on_genome
(
pos_seg
):
bed
=
open
(
pos_seg
,
'
r
'
)
bed
=
open
(
pos_seg
,
'
r
'
)
lines
=
bed
.
readlines
()
# read line by line ?
lines
=
bed
.
readlines
()
# read line by line ?
...
@@ -225,7 +229,7 @@ def get_sequence_on_genome(feature,segments_on_target_genome):
...
@@ -225,7 +229,7 @@ def get_sequence_on_genome(feature,segments_on_target_genome):
return
new_sequence
return
new_sequence
var
=
open
(
"
test_data/variations.txt
"
,
'
r
'
)
lines
=
var
.
readlines
()
lines
=
var
.
readlines
()
var
.
close
()
var
.
close
()
...
@@ -244,52 +248,121 @@ for feature in Features.values():
...
@@ -244,52 +248,121 @@ for feature in Features.values():
add_feature_sequence
(
feature
,
seg_seq
)
add_feature_sequence
(
feature
,
seg_seq
)
def
findOtherStart
(
cds
,
segments_on_target_genome
):
print
(
"
\n
recherche de nouveau codon start:
"
)
frame_shift
=
0
# chercher codon start en amont du cds, dans le mrna
seq_parent
=
get_sequence_on_genome
(
cds
.
parent
,
segments_on_target_genome
)
seq_cds
=
get_sequence_on_genome
(
cds_id
,
segments_on_target_genome
)
sequence_amont
=
seq_parent
[
0
:
seq_parent
.
rfind
(
seq_cds
)]
# get the mrna sequence before the last occurence of the cds sequence
print
(
"
sequence en amont dans le mRNA:
"
,
sequence_amont
)
if
"
ATG
"
in
sequence_amont
:
start_pos_list
=
[
m
.
start
()
for
m
in
re
.
finditer
(
'
(?=ATG)
'
,
sequence_amont
)]
stop_pos_list
=
[
m
.
start
()
for
m
in
re
.
finditer
(
'
(?=TAG|TAA|TGA)
'
,
sequence_amont
)]
print
(
"
position des starts:
"
,
start_pos_list
,
"
position des stops:
"
,
stop_pos_list
,
"
en amont du cds
"
)
# positions (overlapping) où on trouve un atg.
# vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !)
print
(
"
verification des stops après les starts:
"
)
for
start_pos
in
start_pos_list
:
start_pos_frame
=
start_pos
%
3
n
=
len
(
sequence_amont
)
-
start_pos
+
1
if
True
not
in
(
(
stop_pos
%
3
==
start_pos_frame
)
&
(
stop_pos
>
start_pos
)
for
stop_pos
in
stop_pos_list
)
:
#print("codon start candidat trouvé dans l'arn messager,",n,"bases en amont du cds")
# calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture !
frame_shift
=
(
frame_shift
+
n
)
%
3
# vérifier le frame shift !!
print
(
"
le start à la position
"
,
start_pos
,
"
,
"
,
n
,
"
bases en amont du cds, n
'
a pas de stop en aval dans le même cadre de lecture
"
)
else
:
print
(
"
le start à la position
"
,
start_pos
,
"
,
"
,
n
,
"
bases en amont du cds, a un stop en aval dans le même cadre de lecture
"
)
# chercher codon start en aval, dans le cds
if
"
ATG
"
in
seq_cds
:
start_pos_list
=
[
m
.
start
()
for
m
in
re
.
finditer
(
'
(?=ATG)
'
,
seq_cds
)]
print
(
"
codon start candidat trouvé plus loin dans le cds, à la base
"
,
start_pos_list
[
0
])
# print seulement le premier
print
(
"
\n
"
)
version
=
"
new
"
version
=
"
new
"
import
re
import
re
for
cds_id
in
cds_var
.
keys
():
for
cds_id
in
cds_var
.
keys
():
cds
=
Features
[
cds_id
]
cds
=
Features
[
cds_id
]
print
(
"
analysing variations in cds
"
,
cds_id
)
print
(
"
analysing variations in cds
"
,
cds_id
)
frame_shift
=
0
for
var
in
cds_var
[
cds_id
]:
for
var
in
cds_var
[
cds_id
]:
size_var
=
int
(
var
[
11
])
type_var
=
var
[
8
]
type_var
=
var
[
8
]
pos_var
=
int
(
var
[
12
])
-
1
if
type_var
!=
"
no_var
"
:
if
pos_var
<=
3
:
print
(
"
codon start touché
"
)
#chercher start (dans le mrna) avant et apres
# chercher codon start en amont du cds, dans le mrna
seq_parent
=
get_sequence_on_genome
(
cds
.
parent
,
segments_on_target_genome
)
seq_cds
=
get_sequence_on_genome
(
cds_id
,
segments_on_target_genome
)
sequence_amont
=
seq_parent
[
0
:
seq_parent
.
rfind
(
seq_cds
)]
# get the mrna sequence before the last occurence of the cds sequence
print
(
sequence_amont
)
if
"
ATG
"
in
sequence_amont
:
start_pos_list
=
[
m
.
start
()
for
m
in
re
.
finditer
(
'
(?=ATG)
'
,
sequence_amont
)]
stop_pos_list
=
[
m
.
start
()
for
m
in
re
.
finditer
(
'
(?=TAG|TAA|TGA)
'
,
sequence_amont
)]
print
(
start_pos_list
,
stop_pos_list
)
# positions (overlapping) où on trouve un atg.
# vérifier ensuite s'il y a un stop après un atg, et sinon le cadre de lecture de l'atg (peut décaler toute la prot !)
for
start_pos
in
start_pos_list
:
print
(
start_pos
)
start_pos_frame
=
start_pos
%
3
test_list
=
[
39
,
46
,
47
]
if
True
not
in
(
(
stop_pos
%
3
==
start_pos_frame
)
&
(
stop_pos
>
start_pos
)
for
stop_pos
in
stop_pos_list
)
:
n
=
len
(
sequence_amont
)
-
start_pos
print
(
"
codon start candidat trouvé dans l
'
arn messager,
"
,
n
,
"
bases en amont du cds
"
)
# calculer le décalage : si on en trouve un 2 bases en amont, ça décale le cadre de lecture !
frame_shift
=
n
%
3
# mais si on a plusieurs start candidats il faut choisir celui qu'on prend !??
# chercher codon start en aval, dans le cds
if
"
ATG
"
in
seq_cds
:
start_pos_list
=
[
m
.
start
()
for
m
in
re
.
finditer
(
'
(?=ATG)
'
,
seq_cds
)]
for
pos
in
start_pos_list
:
print
(
"
codon start candidat trouvé plus loin dans le cds, à la base
"
,
pos
)
if
"
/
"
in
var
[
11
]:
size_var
=
var
[
11
].
split
(
"
/
"
)
else
:
size_var
=
var
[
11
]
pos_var
=
int
(
var
[
12
])
-
1
if
pos_var
<
3
:
print
(
"
codon start touché
"
)
findOtherStart
(
cds
,
segments_on_target_genome
)
# si on a plusieurs start candidats il faut choisir celui qu'on prend, ils sont pas forcément tous sur le même cadre de lecture
# comment traiter la variation sur le start ?
# après avoir détecté que la var touche le start, traiter la var ?
print
(
var
)
if
type_var
==
"
insertion
"
:
length_ref
=
0
else
:
length_ref
=
len
(
var
[
9
])
if
type_var
==
"
deletion
"
:
length_alt
=
0
else
:
length_alt
=
len
(
var
[
10
])
if
abs
(
length_alt
-
length_ref
)
%
3
==
0
:
# taille diff 3k -> pas de frame shift.
print
(
"
pas de frame shift
"
)
if
pos_var
%
3
==
0
:
# position 3k
print
(
"
variation entre deux codons
"
)
# simplement insertion et/ou deletion.
# délétion éventuelle
# insertion éventuelle
# dépassement éventuel sur le codon d'après si alt ET ref %3!=0.
if
length_ref
%
3
==
0
:
# taille 3K -> pas d'impact sur l'aa en amont
if
length_ref
!=
0
:
print
(
"
deletion de
"
,
var
[
9
])
if
length_alt
!=
0
:
print
(
"
insertion de
"
,
var
[
10
])
else
:
# taille != 3k : impact sur l'aa d'avant (dans lequel on est)
print
(
"
indel puis modification d
'
un autre codon
"
)
# insere+delete+modifie le codon apres
if
length_alt
>
length_ref
:
# subs puis insertion
print
(
"
substitution de
"
,
length_ref
,
"
bases
"
)
insertion
=
var
[
10
][
length_ref
:]
print
(
"
insertion de
"
)
elif
length_ref
>
length_alt
:
# deletion puis subs
print
(
"
substitution de
"
,
length_alt
,
"
bases
"
)
deletion
=
var
[
9
][
0
:
length_alt
]
print
(
"
deletion de
"
)
else
:
# susb
print
(
"
substitution de
"
,
length_ref
,
"
bases
"
)
else
:
# taille 3k, position !=3k
print
(
"
variation au milieu d
'
un codon
"
)
# récupérer le codon dans lequel on a inséré, le prendre en compte pour calculer les effets
else
:
print
(
"
frame shift
"
)
frame_shift
=
(
frame_shift
+
abs
(
length_alt
-
length_ref
))
%
3
# cas en plus : modif de la première base du codon et de la troisième : comment prendre en compte ces deux modif pour le changement de l'aa?
# idée : quand on est sur le premier (ou deuxième??), vérifier le reste du codon avant de regarder le changement d'aa
print
(
"
\n
"
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment