Newer
Older

nina.marthe_ird.fr
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
gfa_path="data/graphs/RiceGraphChr10_cactus.gfa"
gfa_file=open(gfa_path,"r")
lines_gfa=gfa_file.readlines()
gfa_file.close()
gaf_path="graph_chr10.gaf"
gaf_file=open(gaf_path,"r")
lines_gaf=gaf_file.readlines()
gaf_file.close()
gff_path="data/genes/nb_genes_chr10.gff3"
gff_file=open(gff_path,'r')
lines_gff=gff_file.readlines()
gff_file.close()
# make dict for finding start and stop position on ref for the features
feature_pos={}
for line in lines_gff:
line=line.split()
feature_id=line[8].split(";")[0].split("=")[1]
feature_start=line[3]
feature_stop=line[4]
feature_pos[feature_id]=[feature_start,feature_stop]
# create the new W lines for the features' walks in the graph
new_W_lines=""
for line in lines_gaf:
line=line.split()
feature_id=line[0]
if feature_id in feature_pos:
feature_start=feature_pos[feature_id][0]
feature_stop=feature_pos[feature_id][1]
else:
feature_start="."
feature_stop="."
# new_W_lines = W SampleId HapIndex SequenceId start_on_ref stop_on_ref walk
new_W_lines+=f'W\tIRGSP-1_0\t0\t{feature_id}\t{feature_start}\t{feature_stop}\t{line[5]}\n'
gfa_file_out=open("data/RiceGraphChr10_cactus_annotated.gfa","w")
[W_found,W_passed,W_printed]=[False,False,False]
for line in lines_gfa:
if line[0]=="W":
W_found=True
if (line[0]!="W") & W_found:
W_passed=True
if W_passed & (not W_printed):
gfa_file_out.write(new_W_lines)
W_printed=True
gfa_file_out.write(line)
gfa_file_out.close()