Skip to content
Snippets Groups Projects
load_gfa.py 4.47 KiB
Newer Older
  • Learn to ignore specific revisions
  • from tqdm import tqdm
    
    # get the genome name that corresponds to the file_name (file_name if genome_name+walk_name). not universal..?
    def get_genome_name(target_genomes,file_name):
        for genome in target_genomes:
            if genome in file_name:
                return genome
        return ""
    
    # create a dictionnary with the length of the segments : s5->8
    def get_segments_length(segments,verbose):
        seg_len={}
        total_lines=0
        if verbose:
            total_lines=len(["" for line in open(segments,"r")])
        with open(segments,'r') as segments_file:
            for line in tqdm(segments_file, desc="Computing the lengths of the segments", total=total_lines, unit=" segment", disable=not verbose):
                line=line.split()
                seg_id=line[1]
                seg_len[seg_id]=len(line[2])
            return seg_len
        
    # generates a dictionnary that associaces the segments to their sequence : 5->AGGCTAA
    def get_segments_sequence(segments_file,segments_list):
        with open(segments_file,'r') as file_segments:
            seg_seq={}
            for line in file_segments:
                line=line.split()
                seg_id=line[1]
                if seg_id in segments_list:
                    seg_seq[seg_id]=line[2]
        return seg_seq
    
    def get_target_genomes(args):
        # get the target genomes if there is none specified
        if len(args.target)==0:
            walk_path=args.segment_coordinates_path.joinpath("walks.txt")
            total_lines = len(["" for line in open(walk_path,"r")])
            with open(walk_path,'r') as walks:
                for line in tqdm(walks,desc="Fetching all the genomes in the graph for the transfer",total=total_lines,unit=" line",disable=not args.verbose):
                    genome_name=line.split()[1]
                    if (args.source_genome not in genome_name) and (genome_name not in args.target) and ("MINIGRAPH" not in genome_name):
                        args.target.append(genome_name)
                if args.verbose:
                    print(f"     Genomes found : {args.target}")
    
        # get haplotypes for each target genome
        if args.haplotype:
            target_set=set()
            for genome in args.target:
                bed_files=list(args.segment_coordinates_path.glob(f"*{genome}*.bed"))
                for haplotype in bed_files:
                    haplotype_split=haplotype.name.split("#")
                    target_set.add(haplotype_split[0]+"#"+haplotype_split[1])
            args.target=list(target_set)
    
    # generates a dictionnary that associates a walk_name to a list of segments : chr10->[>s1,>s2,>s4]
    def get_paths(walks_file,target_genome,haplotype):
        with open(walks_file,'r') as file_walks:
            paths={}
            for line in file_walks:
                line=line.split()
                if haplotype:
                    seq_name=line[1]+"#"+line[2]+"#"+line[3]
                else:
                    seq_name=line[1]+'_'+line[3]
                if target_genome in seq_name: # get the walk of the genome
                    path=line[6].split(',')[1:]
                    list_segments=[]
                    for segment in path:
                        list_segments.append(segment)
                    if seq_name in paths:
                        paths[seq_name].extend(list_segments)
                    else:
                        paths[seq_name]=list_segments
        return paths
    
    # appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list)
    def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments.
        with open(pos_seg,'r') as bed:
            seg_count=0
            file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename
            for line in bed:
                line=line.split()
                [seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
                # check if segment present twice on the same walk ???
                #segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
                if seg not in segments_on_target_genome:
                    segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
                if file_name not in segments_on_target_genome[seg]:
                    segments_on_target_genome[seg][file_name]=list()
                segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index])
                seg_count+=1