Skip to content
Snippets Groups Projects
Commit 6f09e872 authored by nina.marthe_ird.fr's avatar nina.marthe_ird.fr
Browse files

adapted the code to select the genomes to handle, and treat assemblies with...

adapted the code to select the genomes to handle, and treat assemblies with contigs instead of chromosomes
parent 397e4149
No related branches found
No related tags found
No related merge requests found
import subprocess
import sys
def has_numbers(inputString):
return any(char.isdigit() for char in inputString)
def getChrName(chromosome_field):
chromosome_id=""
if has_numbers(chromosome_field):
for char in reversed(chromosome_field): # take the last digits of the field
def getSeqNumber(name_field):
seq_number=""
if has_numbers(name_field[-3:]):
for char in reversed(name_field): # take the last digits of the field
if not char.isdigit():
break
else:
chromosome_id+=char
chromosome_id="Chr"+chromosome_id[::-1]
seq_number+=char
else:
for char in reversed(chromosome_field): # take the last uppercase chars of the fied
for char in reversed(name_field): # take the last uppercase chars of the fied
if not char.isupper():
break
else:
chromosome_id+=char
chromosome_id="Chr"+chromosome_id[::-1]
return chromosome_id
if not(len(sys.argv)==2) :
print("expected input : gfa file with walks.")
print("output : bed files giving the coordinates of the segments on the genomes (or on minigraph segments).")
sys.exit(1)
elif (sys.argv[1]=="-h") :
print("expected input : gfa file with walks.")
print("output : bed files giving the coordinates of the segments on the genomes (or on minigraph segments).")
sys.exit(1)
gfa=sys.argv[1]
# get the lines that start with "S"
command="grep ^S "+gfa+" > segments.txt"
subprocess.run(command,shell=True)
segments = open('segments.txt','r')
lines=segments.readlines()
segments.close()
# build a dictionnary with the segment sizes
segments_size={}
for line in lines:
line=line.split()
seg_id='s'+line[1]
seg_size=len(line[2])
segments_size[seg_id]=seg_size
# get the lines that start with "W"
command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > walks.txt"
subprocess.run(command,shell=True)
walks=open('walks.txt','r')
lines=walks.readlines()
walks.close()
# on these lines, get the name of the genome to name the output bed file
file_names=list()
for line in lines :
line=line.split()
name=line[3]
path_start=int(line[4])
chromosome_field=line[3]
chromosome_id=getChrName(chromosome_field)
file_name=name+'.bed'
# if we are writing in the file for the first time, overwrite it. else, append it
# this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
if file_name not in file_names:
file_names.append(file_name)
out_bed = open(file_name, 'w')
else :
out_bed = open(file_name, 'a')
path=line[6].split(',')
position=path_start
seq_number+=char
return seq_number[::-1]
for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
# coordinates calculation : start=position, stop=position+segment_size-1, then position+=segment_size
chr='Chr'+chromosome_id[len(chromosome_id)-2:]
seg_start=position
seg_name='s'+path[i][1:]
seg_stop=position+segments_size[seg_name]
if path[i][0:1]=='>':
orientation='+'
elif path[i][0:1]=='<':
orientation='-'
else:
orientation='.'
def get_seg_len(gfa):
# get the lines that start with "S"
command="grep ^S "+gfa+" > seg_coord/segments.txt"
subprocess.run(command,shell=True,timeout=None)
segments = open('seg_coord/segments.txt','r')
lines=segments.readlines()
segments.close()
# build a dictionnary with the segment sizes
segments_size={}
for line in lines:
line=line.split()
seg_id='s'+line[1]
seg_size=len(line[2])
segments_size[seg_id]=seg_size
return segments_size
def check_walk_name(walk_names,name):
name_found=False
for walk in walk_names:
if walk in name:
name_found=True
break
return name_found # rename name_found var
def seg_coord(gfa,walk_names):
# create directory to store output files
command="mkdir seg_coord"
subprocess.run(command,shell=True)
segments_size=get_seg_len(gfa)
# get the lines that start with "W"
command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > seg_coord/walks.txt"
subprocess.run(command,shell=True,timeout=None)
walks=open('seg_coord/walks.txt','r')
lines=walks.readlines()
walks.close()
# on these lines, get the name of the genome to name the output bed file
file_names=list()
for line in lines :
line=line.split()
name=line[3]
if check_walk_name(walk_names,name) | (len(walk_names)==1): # len=1 if there is only the source genome.
path_start=int(line[4])
seq_name=name.split('_')[-1]
if (seq_name[0:10]=="chromosome") | (seq_name[0:10]=="Chromosome"):
seq_name="Chr"+seq_name[10:]
file_name="seg_coord/"+name+'.bed'
# if we are writing in the file for the first time, overwrite it. else, append it
# this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
if file_name not in file_names:
file_names.append(file_name)
out_bed = open(file_name, 'w')
else :
out_bed = open(file_name, 'a')
path=line[6].split(',')
position=path_start
for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
# coordinates calculation : start=position, stop=position+segment_size-1, then position+=segment_size
seg_start=position
seg_name='s'+path[i][1:]
seg_stop=position+segments_size[seg_name]
out_line=seq_name+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
out_bed.write(out_line)
position+=segments_size[seg_name]
out_bed.close()
out_line=chr+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
out_bed.write(out_line)
position+=segments_size[seg_name]
out_bed.close()
command="rm segments.txt && rm walks.txt"
subprocess.run(command,shell=True)
command="if [ 0 -lt $(ls _MINIGRAPH_* 2>/dev/null | wc -w) ]; then mkdir minigraph_segments && mv _MINIGRAPH_* minigraph_segments; fi"
subprocess.run(command,shell=True)
\ No newline at end of file
command="rm seg_coord/segments.txt && rm seg_coord/walks.txt"
subprocess.run(command,shell=True,timeout=None)
command="if ls test/*_MINIGRAPH_* 1> /dev/null 2>&1; then mkdir seg_coord/minigraph_segments && mv *_MINIGRAPH_* seg_coord/minigraph_segments/; fi"
subprocess.run(command,shell=True,timeout=None)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment