Newer
Older
import subprocess

nina.marthe_ird.fr
committed
from Graph_gff import write_line
NMarthe
committed
def has_numbers(inputString):
return any(char.isdigit() for char in inputString)

nina.marthe_ird.fr
committed
def getSeqNumber(name_field):
seq_number=""
if has_numbers(name_field[-3:]):
for char in reversed(name_field): # take the last digits of the field
if not char.isdigit():
break
else:

nina.marthe_ird.fr
committed
seq_number+=char

nina.marthe_ird.fr
committed
for char in reversed(name_field): # take the last uppercase chars of the fied
if not char.isupper():
break
else:

nina.marthe_ird.fr
committed
seq_number+=char
return seq_number[::-1]

nina.marthe_ird.fr
committed
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def get_seg_len(gfa):
# get the lines that start with "S"
command="grep ^S "+gfa+" > seg_coord/segments.txt"
subprocess.run(command,shell=True,timeout=None)
segments = open('seg_coord/segments.txt','r')
lines=segments.readlines()
segments.close()
# build a dictionnary with the segment sizes
segments_size={}
for line in lines:
line=line.split()
seg_id='s'+line[1]
seg_size=len(line[2])
segments_size[seg_id]=seg_size
return segments_size
def check_walk_name(walk_names,name):
name_found=False
for walk in walk_names:
if walk in name:
name_found=True
break
return name_found # rename name_found var
def seg_coord(gfa,walk_names):
# create directory to store output files
command="mkdir seg_coord"
subprocess.run(command,shell=True)
segments_size=get_seg_len(gfa)
# get the lines that start with "W"
command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > seg_coord/walks.txt"
subprocess.run(command,shell=True,timeout=None)
walks=open('seg_coord/walks.txt','r')
lines=walks.readlines()
walks.close()
# on these lines, get the name of the genome to name the output bed file
file_names=list()
for line in lines :
line=line.split()

nina.marthe_ird.fr
committed
name=line[1]+"_"+line[3]

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
if (check_walk_name(walk_names,name)) | ((len(walk_names)==1) & ("MINIGRAPH" not in name)): # len=1 if there is only the source genome.

nina.marthe_ird.fr
committed
path_start=int(line[4])
seq_name=name.split('_')[-1]
if (seq_name[0:10]=="chromosome") | (seq_name[0:10]=="Chromosome"):
seq_name="Chr"+seq_name[10:]
file_name="seg_coord/"+name+'.bed'
# if we are writing in the file for the first time, overwrite it. else, append it
# this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
if file_name not in file_names:
file_names.append(file_name)
out_bed = open(file_name, 'w')
else :
out_bed = open(file_name, 'a')

nina.marthe_ird.fr
committed
output_bed=[0,"",out_bed]

nina.marthe_ird.fr
committed
path=line[6].split(',')
position=path_start
for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
# coordinates calculation : start=position, stop=position+segment_size-1, then position+=segment_size

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
seg_start=position
seg_name='s'+path[i][1:]
seg_stop=position+segments_size[seg_name]

nina.marthe_ird.fr
committed

nina.marthe_ird.fr
committed
out_line=seq_name+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'

nina.marthe_ird.fr
committed
write_line(out_line,output_bed,False)

nina.marthe_ird.fr
committed
position+=segments_size[seg_name]

nina.marthe_ird.fr
committed
write_line("",output_bed,True)

nina.marthe_ird.fr
committed
out_bed.close()

nina.marthe_ird.fr
committed
command="rm seg_coord/segments.txt && rm seg_coord/walks.txt"
subprocess.run(command,shell=True,timeout=None)