Newer
Older

nina.marthe_ird.fr
committed
from tqdm import tqdm
# get the genome name that corresponds to the file_name (file_name if genome_name+walk_name). not universal..?

nina.marthe_ird.fr
committed
def get_genome_name(target_genomes,file_name,haplo):
if haplo:
genome_name=file_name.split("#")[0]+"#"+file_name.split("#")[1]
else:
genome_name=file_name.split("#")[0]

nina.marthe_ird.fr
committed
for genome in target_genomes:

nina.marthe_ird.fr
committed
if genome == genome_name:

nina.marthe_ird.fr
committed
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
return genome
return ""
# create a dictionnary with the length of the segments : s5->8
def get_segments_length(segments,verbose):
seg_len={}
total_lines=0
if verbose:
total_lines=len(["" for line in open(segments,"r")])
with open(segments,'r') as segments_file:
for line in tqdm(segments_file, desc="Computing the lengths of the segments", total=total_lines, unit=" segment", disable=not verbose):
line=line.split()
seg_id=line[1]
seg_len[seg_id]=len(line[2])
return seg_len
# generates a dictionnary that associaces the segments to their sequence : 5->AGGCTAA
def get_segments_sequence(segments_file,segments_list):
with open(segments_file,'r') as file_segments:
seg_seq={}
for line in file_segments:
line=line.split()
seg_id=line[1]
if seg_id in segments_list:
seg_seq[seg_id]=line[2]
return seg_seq
def get_target_genomes(args):
# get the target genomes if there is none specified
if len(args.target)==0:
walk_path=args.segment_coordinates_path.joinpath("walks.txt")
total_lines = len(["" for line in open(walk_path,"r")])
with open(walk_path,'r') as walks:
for line in tqdm(walks,desc="Fetching all the genomes in the graph for the transfer",total=total_lines,unit=" line",disable=not args.verbose):
genome_name=line.split()[1]
if (args.source_genome not in genome_name) and (genome_name not in args.target) and ("MINIGRAPH" not in genome_name):
args.target.append(genome_name)
if args.verbose:
print(f" Genomes found : {args.target}")
# get haplotypes for each target genome
if args.haplotype:
target_set=set()
for genome in args.target:
bed_files=list(args.segment_coordinates_path.glob(f"*{genome}*.bed"))
for haplotype in bed_files:
haplotype_split=haplotype.name.split("#")
target_set.add(haplotype_split[0]+"#"+haplotype_split[1])
args.target=list(target_set)
# generates a dictionnary that associates a walk_name to a list of segments : chr10->[>s1,>s2,>s4]
def get_paths(walks_file,target_genome,haplotype):
with open(walks_file,'r') as file_walks:
paths={}
for line in file_walks:
line=line.split()
if haplotype:

nina.marthe_ird.fr
committed
genome_name=line[1]+"#"+line[2]

nina.marthe_ird.fr
committed
seq_name=line[1]+"#"+line[2]+"#"+line[3]
else:

nina.marthe_ird.fr
committed
genome_name=line[1]

nina.marthe_ird.fr
committed
seq_name=line[1]+'#'+line[3]

nina.marthe_ird.fr
committed
if target_genome == genome_name: # get the walk of the genome

nina.marthe_ird.fr
committed
path=line[6].split(',')[1:]
list_segments=[]
for segment in path:
list_segments.append(segment)
if seq_name in paths:
paths[seq_name].extend(list_segments)
else:
paths[seq_name]=list_segments
return paths
# appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list)
def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments.
with open(pos_seg,'r') as bed:
seg_count=0
file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename
for line in bed:
line=line.split()
[seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
# check if segment present twice on the same walk ???
#segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
if seg not in segments_on_target_genome:
segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
if file_name not in segments_on_target_genome[seg]:
segments_on_target_genome[seg][file_name]=list()
segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index])
seg_count+=1