from tqdm import tqdm # get the genome name that corresponds to the file_name (file_name if genome_name+walk_name). not universal..? def get_genome_name(target_genomes,file_name): for genome in target_genomes: if genome in file_name: return genome return "" # create a dictionnary with the length of the segments : s5->8 def get_segments_length(segments,verbose): seg_len={} total_lines=0 if verbose: total_lines=len(["" for line in open(segments,"r")]) with open(segments,'r') as segments_file: for line in tqdm(segments_file, desc="Computing the lengths of the segments", total=total_lines, unit=" segment", disable=not verbose): line=line.split() seg_id=line[1] seg_len[seg_id]=len(line[2]) return seg_len # generates a dictionnary that associaces the segments to their sequence : 5->AGGCTAA def get_segments_sequence(segments_file,segments_list): with open(segments_file,'r') as file_segments: seg_seq={} for line in file_segments: line=line.split() seg_id=line[1] if seg_id in segments_list: seg_seq[seg_id]=line[2] return seg_seq def get_target_genomes(args): # get the target genomes if there is none specified if len(args.target)==0: walk_path=args.segment_coordinates_path.joinpath("walks.txt") total_lines = len(["" for line in open(walk_path,"r")]) with open(walk_path,'r') as walks: for line in tqdm(walks,desc="Fetching all the genomes in the graph for the transfer",total=total_lines,unit=" line",disable=not args.verbose): genome_name=line.split()[1] if (args.source_genome not in genome_name) and (genome_name not in args.target) and ("MINIGRAPH" not in genome_name): args.target.append(genome_name) if args.verbose: print(f" Genomes found : {args.target}") # get haplotypes for each target genome if args.haplotype: target_set=set() for genome in args.target: bed_files=list(args.segment_coordinates_path.glob(f"*{genome}*.bed")) for haplotype in bed_files: haplotype_split=haplotype.name.split("#") target_set.add(haplotype_split[0]+"#"+haplotype_split[1]) args.target=list(target_set) # generates a dictionnary that associates a walk_name to a list of segments : chr10->[>s1,>s2,>s4] def get_paths(walks_file,target_genome,haplotype): with open(walks_file,'r') as file_walks: paths={} for line in file_walks: line=line.split() if haplotype: seq_name=line[1]+"#"+line[2]+"#"+line[3] else: seq_name=line[1]+'_'+line[3] if target_genome in seq_name: # get the walk of the genome path=line[6].split(',')[1:] list_segments=[] for segment in path: list_segments.append(segment) if seq_name in paths: paths[seq_name].extend(list_segments) else: paths[seq_name]=list_segments return paths # appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list) def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments. with open(pos_seg,'r') as bed: seg_count=0 file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename for line in bed: line=line.split() [seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system # check if segment present twice on the same walk ??? #segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name] if seg not in segments_on_target_genome: segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk if file_name not in segments_on_target_genome[seg]: segments_on_target_genome[seg][file_name]=list() segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index]) seg_count+=1