from tqdm import tqdm import subprocess import sys # get the genome name that corresponds to the file_name (file_name if genome_name+walk_name). not universal..? def get_genome_name(target_genomes,file_name): for genome in target_genomes: if genome in file_name: return genome return "" # create a dictionnary with the length of the segments : s5->8 def get_segments_length(segments,verbose): seg_len={} total_lines=0 if verbose: total_lines=len(["" for line in open(segments,"r")]) with open(segments,'r') as segments_file: for line in tqdm(segments_file, desc="Computing the lengths of the segments", total=total_lines, unit=" segment", disable=not verbose): line=line.split() seg_id=line[1] seg_len[seg_id]=len(line[2]) return seg_len def get_target_genomes(args): # get the target genomes if there is none specified if len(args.target)==0: walk_path=args.segment_coordinates_path.joinpath("walks.txt") total_lines = len(["" for line in open(walk_path,"r")]) with open(walk_path,'r') as walks: for line in tqdm(walks,desc="Fetching all the genomes in the graph for the transfer",total=total_lines,unit=" line",disable=not args.verbose): genome_name=line.split()[1] if (args.source_genome not in genome_name) and (genome_name not in args.target) and ("MINIGRAPH" not in genome_name): args.target.append(genome_name) if args.verbose: print(f" Genomes found : {args.target}") # get haplotypes for each target genome if args.haplotype: target_set=set() for genome in args.target: bed_files=list(args.segment_coordinates_path.glob(f"*{genome}*.bed")) for haplotype in bed_files: haplotype_split=haplotype.name.split("#") target_set.add(haplotype_split[0]+"#"+haplotype_split[1]) args.target=list(target_set) def run_intersect(args): print("Computing the intersection between the annotations and the graph segments") seg_coord_path=args.segment_coordinates_path if args.haplotype: if args.source_haplotype=='.': files=list(seg_coord_path.glob(f"*{args.source_genome}*.bed")) # get all bed files from the source genome in seg_coord # select haplotype source_haplotypes=[] for file in files: file_haplotype=file.name.split("#")[1] source_haplotypes.append(file_haplotype) first_source_haplo=min(source_haplotypes) args.source_haplotype=first_source_haplo haplo=f"#{first_source_haplo}#" print('\033[35m'+f'Warning : No source haplotype was given with \'-sh\'. Haplotype {first_source_haplo} will be used.'+'\033[0m') files=list(seg_coord_path.glob(f"*{args.source_genome}*{haplo}*.bed")) # get all bed files from the source genome haplotype in seg_coord message=f'No bed files corresponding to the source genome haplotype {first_source_haplo} found in {seg_coord_path}' else: haplo = f"#{args.source_haplotype}#" files=list(seg_coord_path.glob(f"*{args.source_genome}*{haplo}*.bed")) # get all bed files from the source genome in seg_coord message=f'No bed files corresponding to the source genome haplotype found in {seg_coord_path}' if len(files)==0: sys.exit(message) if args.verbose: print(f' Found {len(files)} paths corresponding to the source genome haplotype {args.source_haplotype}') else: files=list(seg_coord_path.glob(f"*{args.source_genome}*.bed")) # get all bed files from the source genome in seg_coord message='\33[31m'+f'Error : No bed files corresponding to the source genome found in {seg_coord_path}. Please check that the source genome name is right.'+'\033[0m' if len(files)==0: sys.exit(message) if args.verbose: print(f' Found {len(files)} paths corresponding to the source genome') intersect_path=args.outdir.joinpath("intersect") command=f"echo -n '' > {intersect_path}" # empty the file "intersect" subprocess.run(command,shell=True,timeout=None) for file in files: if args.verbose: print(f' Building the intersection for the path {file.stem}') command=f"bedtools intersect -nonamecheck -wo -a {file.as_posix()} -b {args.gff.as_posix()}>>{intersect_path}" subprocess.run(command,shell=True,timeout=None) intersect_lines = int(subprocess.Popen(f"wc -l {intersect_path}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()[0]) if intersect_lines==0: print('\33[31m'+"Error : No lines in the intersect. Please check that the sequence id in the GFF file (1st field) matches the sequence id in the GFA file (4th field of the W lines or 3rd part of the second field of the P lines)."+'\033[0m') exit()