Newer
Older
import subprocess
NMarthe
committed
def has_numbers(inputString):
return any(char.isdigit() for char in inputString)

nina.marthe_ird.fr
committed
def getSeqNumber(name_field):
seq_number=""
if has_numbers(name_field[-3:]):
for char in reversed(name_field): # take the last digits of the field
if not char.isdigit():
break
else:

nina.marthe_ird.fr
committed
seq_number+=char

nina.marthe_ird.fr
committed
for char in reversed(name_field): # take the last uppercase chars of the fied
if not char.isupper():
break
else:

nina.marthe_ird.fr
committed
seq_number+=char
return seq_number[::-1]

nina.marthe_ird.fr
committed
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
def get_seg_len(gfa):
# get the lines that start with "S"
command="grep ^S "+gfa+" > seg_coord/segments.txt"
subprocess.run(command,shell=True,timeout=None)
segments = open('seg_coord/segments.txt','r')
lines=segments.readlines()
segments.close()
# build a dictionnary with the segment sizes
segments_size={}
for line in lines:
line=line.split()
seg_id='s'+line[1]
seg_size=len(line[2])
segments_size[seg_id]=seg_size
return segments_size
def check_walk_name(walk_names,name):
name_found=False
for walk in walk_names:
if walk in name:
name_found=True
break
return name_found # rename name_found var
def seg_coord(gfa,walk_names):
# create directory to store output files
command="mkdir seg_coord"
subprocess.run(command,shell=True)
segments_size=get_seg_len(gfa)
# get the lines that start with "W"
command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > seg_coord/walks.txt"
subprocess.run(command,shell=True,timeout=None)
walks=open('seg_coord/walks.txt','r')
lines=walks.readlines()
walks.close()
# on these lines, get the name of the genome to name the output bed file
file_names=list()
for line in lines :
line=line.split()
name=line[3]
if check_walk_name(walk_names,name) | (len(walk_names)==1): # len=1 if there is only the source genome.
path_start=int(line[4])
seq_name=name.split('_')[-1]
if (seq_name[0:10]=="chromosome") | (seq_name[0:10]=="Chromosome"):
seq_name="Chr"+seq_name[10:]
file_name="seg_coord/"+name+'.bed'
# if we are writing in the file for the first time, overwrite it. else, append it
# this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
if file_name not in file_names:
file_names.append(file_name)
out_bed = open(file_name, 'w')
else :
out_bed = open(file_name, 'a')
path=line[6].split(',')
position=path_start
for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
# coordinates calculation : start=position, stop=position+segment_size-1, then position+=segment_size
seg_start=position
seg_name='s'+path[i][1:]
seg_stop=position+segments_size[seg_name]
out_line=seq_name+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
out_bed.write(out_line)
position+=segments_size[seg_name]
out_bed.close()

nina.marthe_ird.fr
committed
command="rm seg_coord/segments.txt && rm seg_coord/walks.txt"
subprocess.run(command,shell=True,timeout=None)
command="if ls test/*_MINIGRAPH_* 1> /dev/null 2>&1; then mkdir seg_coord/minigraph_segments && mv *_MINIGRAPH_* seg_coord/minigraph_segments/; fi"
subprocess.run(command,shell=True,timeout=None)