Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from tqdm import tqdm
import subprocess
import sys
# get the genome name that corresponds to the file_name (file_name if genome_name+walk_name). not universal..?
def get_genome_name(target_genomes,file_name):
for genome in target_genomes:
if genome in file_name:
return genome
return ""
# create a dictionnary with the length of the segments : s5->8
def get_segments_length(segments,verbose):
seg_len={}
total_lines=0
if verbose:
total_lines=len(["" for line in open(segments,"r")])
with open(segments,'r') as segments_file:
for line in tqdm(segments_file, desc="Computing the lengths of the segments", total=total_lines, unit=" segment", disable=not verbose):
line=line.split()
seg_id=line[1]
seg_len[seg_id]=len(line[2])
return seg_len
def get_target_genomes(args):
# get the target genomes if there is none specified
if len(args.target)==0:
walk_path=args.segment_coordinates_path.joinpath("walks.txt")
total_lines = len(["" for line in open(walk_path,"r")])
with open(walk_path,'r') as walks:
for line in tqdm(walks,desc="Fetching all the genomes in the graph for the transfer",total=total_lines,unit=" line",disable=not args.verbose):
genome_name=line.split()[1]
if (args.source_genome not in genome_name) and (genome_name not in args.target) and ("MINIGRAPH" not in genome_name):
args.target.append(genome_name)
if args.verbose:
print(f" Genomes found : {args.target}")
# get haplotypes for each target genome
if args.haplotype:
target_set=set()
for genome in args.target:
bed_files=list(args.segment_coordinates_path.glob(f"*{genome}*.bed"))
for haplotype in bed_files:
haplotype_split=haplotype.name.split("#")
target_set.add(haplotype_split[0]+"#"+haplotype_split[1])
args.target=list(target_set)
def run_intersect(args):
print("Computing the intersection between the annotations and the graph segments")
seg_coord_path=args.segment_coordinates_path
if args.haplotype:
if args.source_haplotype=='.':
files=list(seg_coord_path.glob(f"*{args.source_genome}*.bed")) # get all bed files from the source genome in seg_coord
# select haplotype
source_haplotypes=[]
for file in files:
file_haplotype=file.name.split("#")[1]
source_haplotypes.append(file_haplotype)
first_source_haplo=min(source_haplotypes)
args.source_haplotype=first_source_haplo
haplo=f"#{first_source_haplo}#"
print('\033[35m'+f'Warning : No source haplotype was given with \'-sh\'. Haplotype {first_source_haplo} will be used.'+'\033[0m')
files=list(seg_coord_path.glob(f"*{args.source_genome}*{haplo}*.bed")) # get all bed files from the source genome haplotype in seg_coord
message=f'No bed files corresponding to the source genome haplotype {first_source_haplo} found in {seg_coord_path}'
else:
haplo = f"#{args.source_haplotype}#"
files=list(seg_coord_path.glob(f"*{args.source_genome}*{haplo}*.bed")) # get all bed files from the source genome in seg_coord
message=f'No bed files corresponding to the source genome haplotype found in {seg_coord_path}'
if len(files)==0:
sys.exit(message)
if args.verbose:
print(f' Found {len(files)} paths corresponding to the source genome haplotype {args.source_haplotype}')
else:
files=list(seg_coord_path.glob(f"*{args.source_genome}*.bed")) # get all bed files from the source genome in seg_coord
message='\33[31m'+f'Error : No bed files corresponding to the source genome found in {seg_coord_path}. Please check that the source genome name is right.'+'\033[0m'
if len(files)==0:
sys.exit(message)
if args.verbose:
print(f' Found {len(files)} paths corresponding to the source genome')
intersect_path=args.outdir.joinpath("intersect")
command=f"echo -n '' > {intersect_path}" # empty the file "intersect"
subprocess.run(command,shell=True,timeout=None)
for file in files:
if args.verbose:
print(f' Building the intersection for the path {file.stem}')
command=f"bedtools intersect -nonamecheck -wo -a {file.as_posix()} -b {args.gff.as_posix()}>>{intersect_path}"
subprocess.run(command,shell=True,timeout=None)
intersect_lines = int(subprocess.Popen(f"wc -l {intersect_path}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()[0])
if intersect_lines==0:
print('\33[31m'+"Error : No lines in the intersect. Please check that the sequence id in the GFF file (1st field) matches the sequence id in the GFA file (4th field of the W lines or 3rd part of the second field of the P lines)."+'\033[0m')
exit()