Newer
Older

nina.marthe_ird.fr
committed
from tqdm import tqdm
# get the genome name that corresponds to the file_name (file_name if genome_name+walk_name). not universal..?
def get_genome_name(target_genomes,file_name):

nina.marthe_ird.fr
committed
genome_name=file_name.split("#")[0]

nina.marthe_ird.fr
committed
for genome in target_genomes:

nina.marthe_ird.fr
committed
if genome == genome_name:

nina.marthe_ird.fr
committed
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
return genome
return ""
# create a dictionnary with the length of the segments : s5->8
def get_segments_length(segments,verbose):
seg_len={}
total_lines=0
if verbose:
total_lines=len(["" for line in open(segments,"r")])
with open(segments,'r') as segments_file:
for line in tqdm(segments_file, desc="Computing the lengths of the segments", total=total_lines, unit=" segment", disable=not verbose):
line=line.split()
seg_id=line[1]
seg_len[seg_id]=len(line[2])
return seg_len
# generates a dictionnary that associaces the segments to their sequence : 5->AGGCTAA
def get_segments_sequence(segments_file,segments_list):
with open(segments_file,'r') as file_segments:
seg_seq={}
for line in file_segments:
line=line.split()
seg_id=line[1]
if seg_id in segments_list:
seg_seq[seg_id]=line[2]
return seg_seq
def get_target_genomes(args):
# get the target genomes if there is none specified
if len(args.target)==0:
walk_path=args.segment_coordinates_path.joinpath("walks.txt")
total_lines = len(["" for line in open(walk_path,"r")])
with open(walk_path,'r') as walks:
for line in tqdm(walks,desc="Fetching all the genomes in the graph for the transfer",total=total_lines,unit=" line",disable=not args.verbose):
genome_name=line.split()[1]
if (args.source_genome not in genome_name) and (genome_name not in args.target) and ("MINIGRAPH" not in genome_name):
args.target.append(genome_name)
if args.verbose:
print(f" Genomes found : {args.target}")
# get haplotypes for each target genome
if args.haplotype:
target_set=set()
for genome in args.target:
bed_files=list(args.segment_coordinates_path.glob(f"*{genome}*.bed"))
for haplotype in bed_files:
haplotype_split=haplotype.name.split("#")
target_set.add(haplotype_split[0]+"#"+haplotype_split[1])
args.target=list(target_set)
# generates a dictionnary that associates a walk_name to a list of segments : chr10->[>s1,>s2,>s4]
def get_paths(walks_file,target_genome,haplotype):
with open(walks_file,'r') as file_walks:
paths={}
for line in file_walks:
line=line.split()
if haplotype:
seq_name=line[1]+"#"+line[2]+"#"+line[3]
else:

nina.marthe_ird.fr
committed
seq_name=line[1]+'#'+line[3]
if target_genome == line[1]: # get the walk of the genome

nina.marthe_ird.fr
committed
path=line[6].split(',')[1:]
list_segments=[]
for segment in path:
list_segments.append(segment)
if seq_name in paths:
paths[seq_name].extend(list_segments)
else:
paths[seq_name]=list_segments
return paths
# appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list)
def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments.
with open(pos_seg,'r') as bed:
seg_count=0
file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename
for line in bed:
line=line.split()
[seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
# check if segment present twice on the same walk ???
#segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
if seg not in segments_on_target_genome:
segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
if file_name not in segments_on_target_genome[seg]:
segments_on_target_genome[seg][file_name]=list()
segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index])
seg_count+=1