improved how paths and walks are detected and how bed files are attributed to...

improved how paths and walks are detected and how bed files are attributed to genomes; fixes bug when a genome name was included in another genome name

improved how paths and walks are detected and how bed files are attributed to...
improved how paths and walks are detected and how bed files are attributed to genomes; fixes bug when a genome name was included in another genome name
dd08d372 · nina.marthe_ird.fr · e822f3ab · dd08d372 · dd08d372 · dd08d372
Commit dd08d372 authored 1 month ago by nina.marthe_ird.fr
--- a/GrAnnoT/getSegmentsCoordinates.py
+++ b/GrAnnoT/getSegmentsCoordinates.py
@@ -43,45 +43,45 @@ def get_walks_paths(gfa,seg_coord_path,verbose):
    command=f"grep ^W {gfa} | sed 's/>/,>/g' | sed 's/</,</g' > {seg_coord_path.joinpath('walks.txt')}"
    subprocess.run(command,shell=True,timeout=None)
    w_lines =  subprocess.Popen(f"wc -l {seg_coord_path.joinpath('walks.txt')}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
-    if w_lines[0]=='0':
-        command=f"grep ^P {gfa} > {seg_coord_path.joinpath('paths.txt')}"
+
+    command=f"grep ^P {gfa} > {seg_coord_path.joinpath('paths.txt')}"
+    subprocess.run(command,shell=True,timeout=None)
+    p_lines =  subprocess.Popen(f"wc -l {seg_coord_path.joinpath('paths.txt')}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
+
+    if (p_lines[0]=='0') and (w_lines[0]=='0'):
+        command=f"rm {seg_coord_path.joinpath('walks.txt')} && rm {seg_coord_path.joinpath('paths.txt')}"
        subprocess.run(command,shell=True,timeout=None)
-        p_lines =  subprocess.Popen(f"wc -l {seg_coord_path.joinpath('paths.txt')}", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
-
-        if p_lines[0]=='0':
-            command=f"rm {seg_coord_path.joinpath('walks.txt')} && rm {seg_coord_path.joinpath('paths.txt')}"
-            subprocess.run(command,shell=True,timeout=None)
-            sys.exit("No walks or paths detected in the GFA file")
-        else:
-            path_path=seg_coord_path.joinpath("paths.txt")
-            total_lines=0
-            if verbose:
-                total_lines = len(["" for line in open(path_path,"r")])
-            with open(path_path,"r") as path_file, open(seg_coord_path.joinpath("walks.txt"),'w') as walk_file:
-                for line in tqdm(path_file, desc="     Converting paths in walks", total=total_lines, unit=" path", disable=not verbose):
-                    path_line=line.split()
-                    name=path_line[1].split("#") #   assembly#hapl#chr # todo
-                    [asm,hap,seq]=name
-                    # seq=seq.split(':')[0] <- needed for the paths that come from a conversion from walk to path. with paths from pggb graph, not useful.
-                    if ',' in path_line[2]:
-                        path=path_line[2].split(',') # 1+,2+,3+,4-,5+,7+ can be separated by , or ;
+        sys.exit("No walks or paths detected in the GFA file")
+    elif p_lines[0]!='0':
+        path_path=seg_coord_path.joinpath("paths.txt")
+        total_lines=0
+        if verbose:
+            total_lines = len(["" for line in open(path_path,"r")])
+        with open(path_path,"r") as path_file, open(seg_coord_path.joinpath("walks.txt"),'a') as walk_file:
+            for line in tqdm(path_file, desc="     Converting paths in walks", total=total_lines, unit=" path", disable=not verbose):
+                path_line=line.split()
+                name=path_line[1].split("#") # assembly#hapl#chr
+                [asm,hap,seq]=name
+                # seq=seq.split(':')[0] <- needed for the paths that come from a conversion from walk to path. with paths from pggb graph, not useful.
+                if ',' in path_line[2]:
+                    path=path_line[2].split(',') # 1+,2+,3+,4-,5+,7+ can be separated by , or ;
+                else:
+                    path=path_line[2].split(';')
+                # error message if no , or ; but if there is only one segment there is no separator......
+                # expect no overlap !
+                line=f'W\t{asm}\t{hap}\t{seq}\t0\t-\t'
+                for seg in path:
+                    if seg[-1]=="-":
+                        strand="<"
                    else:
-                        path=path_line[2].split(';')
-                    # error message if no , or ; but if there is only one segment there is no separator......
-                    # expect no overlap !
-                    line=f'W\t{asm}\t{hap}\t{seq}\t0\t-\t'
-                    for seg in path:
-                        if seg[-1]=="-":
-                            strand="<"
-                        else:
-                            strand=">"
-                        seg_stranded=strand+seg[:-1]
-                        line+=','
-                        line+=seg_stranded
-                    line+="\n"
-                    walk_file.write(line)
-            command=f"rm {seg_coord_path.joinpath('paths.txt')}"
-            subprocess.run(command,shell=True,timeout=None)
+                        strand=">"
+                    seg_stranded=strand+seg[:-1]
+                    line+=','
+                    line+=seg_stranded
+                line+="\n"
+                walk_file.write(line)
+    command=f"rm {seg_coord_path.joinpath('paths.txt')}"
+    subprocess.run(command,shell=True,timeout=None)
                        
    walk_path=seg_coord_path.joinpath("walks.txt")
    return walk_path
@@ -116,7 +116,7 @@ def generate_bed(line,walk_names,segments_size,file_names,seg_coord_path,transfe
    if haplotype:
        name=line[1]+"#"+line[2]+"#"+line[3]
    else:
-        name=line[1]+'_'+line[3]
+        name=line[1]+'#'+line[3]

    if (check_walk_name(walk_names,name)) or ((len(walk_names)==1) and ("MINIGRAPH" not in name) and transfer): # len=1 if there is only the source genome. dont parse all the walks if no transfer (only graph annotation)
        path_start=int(line[4])

--- a/GrAnnoT/intersect.py
+++ b/GrAnnoT/intersect.py
@@ -33,7 +33,7 @@ def run_intersect(args):
        if args.verbose:
            print(f'     Found {len(files)} paths corresponding to the source genome haplotype {args.source_haplotype}')
    else:
-        files=list(seg_coord_path.glob(f"*{args.source_genome}*.bed")) # get all bed files from the source genome in seg_coord
+        files=list(seg_coord_path.glob(f"{args.source_genome}#*.bed")) # get all bed files from the source genome in seg_coord
        message='\33[31m'+f'Error : No bed files corresponding to the source genome found in {seg_coord_path}. Please check that the source genome name is right.'+'\033[0m'
        if len(files)==0:
            sys.exit(message)

--- a/GrAnnoT/load_gfa.py
+++ b/GrAnnoT/load_gfa.py
@@ -2,8 +2,9 @@ from tqdm import tqdm

 # get the genome name that corresponds to the file_name (file_name if genome_name+walk_name). not universal..?
 def get_genome_name(target_genomes,file_name):
+    genome_name=file_name.split("#")[0]
    for genome in target_genomes:
-        if genome in file_name:
+        if genome == genome_name:
            return genome
    return ""

@@ -63,8 +64,8 @@ def get_paths(walks_file,target_genome,haplotype):
            if haplotype:
                seq_name=line[1]+"#"+line[2]+"#"+line[3]
            else:
-                seq_name=line[1]+'_'+line[3]
-            if target_genome in seq_name: # get the walk of the genome
+                seq_name=line[1]+'#'+line[3]
+            if target_genome == line[1]: # get the walk of the genome
                path=line[6].split(',')[1:]
                list_segments=[]
                for segment in path: