added the conversion of path lines in walk format to be read by grannot. the...

added the conversion of path lines in walk format to be read by grannot. the path name must be #assembly_name#haplotype#sequence_name, for instance IRGSP#1#Chr1

added the conversion of path lines in walk format to be read by grannot. the...
728439c2 · nina.marthe_ird.fr · 4e16261a · 728439c2
Commit 728439c2 authored 10 months ago by nina.marthe_ird.fr
--- a/GrAnnoT/getSegmentsCoordinates.py
+++ b/GrAnnoT/getSegmentsCoordinates.py
 import subprocess
+import sys

 # creates a dictionnary with the length of the segments
 def get_seg_len(gfa):
@@ -9,13 +10,11 @@ def get_seg_len(gfa):
    # build a dictionnary with the segment sizes
    segments_size={}
    with open('seg_coord/segments.txt','r') as seg_file:
-        line=seg_file.readline()
-        while line:
+        for line in seg_file:
            line=line.split()
            seg_id='s'+line[1]
            seg_size=len(line[2])
            segments_size[seg_id]=seg_size
-            line=seg_file.readline()
    return segments_size

 # check that a walk in "walk_names" has "name" in it (check that "name" is a valid walk from the list)
@@ -27,6 +26,51 @@ def check_walk_name(walk_names,name):
            break
    return name_found

+def get_walks_paths(gfa):
+    # get the lines that start with "W"
+    command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > seg_coord/walks.txt"
+    subprocess.run(command,shell=True,timeout=None)
+    w_lines =  subprocess.Popen("wc -l seg_coord/walks.txt", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
+    if w_lines[0]=='0':
+        command="grep ^P "+gfa+" > seg_coord/paths.txt"
+        subprocess.run(command,shell=True,timeout=None)
+        p_lines =  subprocess.Popen("wc -l seg_coord/paths.txt", shell=True, stdout=subprocess.PIPE).stdout.read().decode()
+
+        if p_lines[0]=='0':
+            command="rm seg_coord/walks.txt && rm seg_coord/paths.txt"
+            subprocess.run(command,shell=True,timeout=None)
+            sys.exit("No walks or paths detected in the GFA file")
+        else:
+            with open("seg_coord/paths.txt","r") as path_file, open("seg_coord/walks.txt",'w') as walk_file:
+                for path_line in path_file:
+                    path_line=path_line.split()
+                    name=path_line[1].split("#") #   assembly#hapl#chr # todo
+                    [asm,hap,seq]=name
+                    # seq=seq.split(':')[0] <- needed for the paths that come from a conversion from walk to path. with paths from pggb graph, not useful.
+                    if ',' in path_line[2]:
+                        path=path_line[2].split(',') # 1+,2+,3+,4-,5+,7+ can be separated by , or ;
+                    else:
+                        path=path_line[2].split(';')
+                    # error message if no , or ; but if there is only one segment there is no separator......
+                    # expect no overlap !
+                    line=f'W\t{asm}\t{hap}\t{seq}\t0\t-\t'
+                    for seg in path:
+                        if seg[-1]=="-":
+                            strand="<"
+                        else:
+                            strand=">"
+                        seg_stranded=strand+seg[:-1]
+                        line+=','
+                        line+=seg_stranded
+                    line+="\n"
+                    walk_file.write(line)
+            command="rm seg_coord/paths.txt"
+            subprocess.run(command,shell=True,timeout=None)
+                        
+    walk_path="seg_coord/walks.txt"
+    return walk_path
+
+
 # outputs the coordinates of the segments on the walks in bed files
 def seg_coord(gfa,walk_names):

@@ -36,15 +80,14 @@ def seg_coord(gfa,walk_names):

    segments_size=get_seg_len(gfa)

-    # get the lines that start with "W"
-    command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > seg_coord/walks.txt"
-    subprocess.run(command,shell=True,timeout=None)
+    # get the walks or paths
+    walk_path=get_walks_paths(gfa)

    # on these lines, get the name of the genome to name the output bed file
    file_names=list()
-    with open('seg_coord/walks.txt','r') as walk_file:
-        line=walk_file.readline()
-        while line:
+    with open(walk_path,'r') as walk_file:
+
+        for line in walk_file:
            line=line.split()
            name=line[1]+"_"+line[3]

@@ -75,5 +118,4 @@ def seg_coord(gfa,walk_names):
                        out_line=seq_name+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
                        output_bed.write(out_line)

-                        position=seg_stop
-            line=walk_file.readline()
\ No newline at end of file
+                        position=seg_stop
\ No newline at end of file