load_gfa.py

from tqdm import tqdm

# get the genome name that corresponds to the file_name (file_name if genome_name+walk_name). not universal..?
def get_genome_name(target_genomes,file_name):
    genome_name=file_name.split("#")[0]
    for genome in target_genomes:
        if genome == genome_name:
            return genome
    return ""

# create a dictionnary with the length of the segments : s5->8
def get_segments_length(segments,verbose):
    seg_len={}
    total_lines=0
    if verbose:
        total_lines=len(["" for line in open(segments,"r")])
    with open(segments,'r') as segments_file:
        for line in tqdm(segments_file, desc="Computing the lengths of the segments", total=total_lines, unit=" segment", disable=not verbose):
            line=line.split()
            seg_id=line[1]
            seg_len[seg_id]=len(line[2])
        return seg_len
    
# generates a dictionnary that associaces the segments to their sequence : 5->AGGCTAA
def get_segments_sequence(segments_file,segments_list):
    with open(segments_file,'r') as file_segments:
        seg_seq={}
        for line in file_segments:
            line=line.split()
            seg_id=line[1]
            if seg_id in segments_list:
                seg_seq[seg_id]=line[2]
    return seg_seq

def get_target_genomes(args):
    # get the target genomes if there is none specified
    if len(args.target)==0:
        walk_path=args.segment_coordinates_path.joinpath("walks.txt")
        total_lines = len(["" for line in open(walk_path,"r")])
        with open(walk_path,'r') as walks:
            for line in tqdm(walks,desc="Fetching all the genomes in the graph for the transfer",total=total_lines,unit=" line",disable=not args.verbose):
                genome_name=line.split()[1]
                if (args.source_genome not in genome_name) and (genome_name not in args.target) and ("MINIGRAPH" not in genome_name):
                    args.target.append(genome_name)
            if args.verbose:
                print(f"     Genomes found : {args.target}")

    # get haplotypes for each target genome
    if args.haplotype:
        target_set=set()
        for genome in args.target:
            bed_files=list(args.segment_coordinates_path.glob(f"*{genome}*.bed"))
            for haplotype in bed_files:
                haplotype_split=haplotype.name.split("#")
                target_set.add(haplotype_split[0]+"#"+haplotype_split[1])
        args.target=list(target_set)

# generates a dictionnary that associates a walk_name to a list of segments : chr10->[>s1,>s2,>s4]
def get_paths(walks_file,target_genome,haplotype):
    with open(walks_file,'r') as file_walks:
        paths={}
        for line in file_walks:
            line=line.split()
            if haplotype:
                seq_name=line[1]+"#"+line[2]+"#"+line[3]
            else:
                seq_name=line[1]+'#'+line[3]
            if target_genome == line[1]: # get the walk of the genome
                path=line[6].split(',')[1:]
                list_segments=[]
                for segment in path:
                    list_segments.append(segment)
                if seq_name in paths:
                    paths[seq_name].extend(list_segments)
                else:
                    paths[seq_name]=list_segments
    return paths

# appends a dictionnary that associates a segments with its position on all the walks it's on (start stop and index in the segment list)
def get_segments_positions_on_genome(pos_seg,segments_on_target_genome): # add to the dict the info about the segments.
    with open(pos_seg,'r') as bed:
        seg_count=0
        file_name='.'.join(pos_seg.split('/')[-1].split('.')[0:-1]) # split by '.' to get the filename without the extention, then join by '.' in case there is a '.' in the filename
        for line in bed:
            line=line.split()
            [seg,chrom,start,stop,strand,index]=[line[3],line[0],int(line[1])+1,int(line[2]),line[3][0:1],seg_count] # +1 in the start to convert the bed 0-based coordinate to a 1-based system
            # check if segment present twice on the same walk ???
            #segments_on_target_genome[seg]=[chrom,start,stop,strand,index,file_name]
            if seg not in segments_on_target_genome:
                segments_on_target_genome[seg]={} # dict of walks->segment_info; you get the info about the segment for each walk
            if file_name not in segments_on_target_genome[seg]:
                segments_on_target_genome[seg][file_name]=list()
            segments_on_target_genome[seg][file_name].append([chrom,start,stop,strand,index])
            seg_count+=1