From 6f09e8728c1c91a4b50b1e06bc9e33ac5174d10b Mon Sep 17 00:00:00 2001
From: "nina.marthe_ird.fr" <nina.marthe@ird.fr>
Date: Thu, 15 Feb 2024 13:18:58 +0100
Subject: [PATCH] adapted the code to select the genomes to handle, and treat
 assemblies with contigs instead of chromosomes

---
 getSegmentsCoordinates.py | 186 +++++++++++++++++++-------------------
 1 file changed, 93 insertions(+), 93 deletions(-)

diff --git a/getSegmentsCoordinates.py b/getSegmentsCoordinates.py
index dbd5695..f6f7f81 100644
--- a/getSegmentsCoordinates.py
+++ b/getSegmentsCoordinates.py
@@ -1,107 +1,107 @@
 import subprocess
-import sys
 
 def has_numbers(inputString):
     return any(char.isdigit() for char in inputString)
 
-def getChrName(chromosome_field):
-    chromosome_id=""
-    if has_numbers(chromosome_field):
-        for char in reversed(chromosome_field): # take the last digits of the field
+def getSeqNumber(name_field):
+    seq_number=""
+    if has_numbers(name_field[-3:]):
+        for char in reversed(name_field): # take the last digits of the field
             if not char.isdigit():
                 break
             else:
-                chromosome_id+=char
-        chromosome_id="Chr"+chromosome_id[::-1]
+                seq_number+=char
     else:
-        for char in reversed(chromosome_field): # take the last uppercase chars of the fied
+        for char in reversed(name_field): # take the last uppercase chars of the fied
             if not char.isupper():
                 break
             else:
-                chromosome_id+=char
-        chromosome_id="Chr"+chromosome_id[::-1]
-    return chromosome_id
-
-if not(len(sys.argv)==2) :
-        print("expected input : gfa file with walks.")
-        print("output : bed files giving the coordinates of the segments on the genomes (or on minigraph segments).")
-        sys.exit(1)
-elif (sys.argv[1]=="-h") :
-    print("expected input : gfa file with walks.")
-    print("output : bed files giving the coordinates of the segments on the genomes (or on minigraph segments).")
-    sys.exit(1)
-
-gfa=sys.argv[1]
-
-# get the lines that start with "S"
-command="grep ^S "+gfa+" > segments.txt"
-subprocess.run(command,shell=True)
-segments = open('segments.txt','r')
-lines=segments.readlines()
-segments.close()
-
-# build a dictionnary with the segment sizes
-segments_size={}
-for line in lines:
-    line=line.split()
-    seg_id='s'+line[1]
-    seg_size=len(line[2])
-    segments_size[seg_id]=seg_size
-
-# get the lines that start with "W"
-command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > walks.txt"
-subprocess.run(command,shell=True)
-walks=open('walks.txt','r')
-lines=walks.readlines()
-walks.close()
-
-# on these lines, get the name of the genome to name the output bed file
-file_names=list()
-for line in lines :
-    line=line.split()
-    name=line[3]
-    path_start=int(line[4])
-    chromosome_field=line[3]
-    chromosome_id=getChrName(chromosome_field)
-
-    file_name=name+'.bed'
-
-    # if we are writing in the file for the first time, overwrite it. else, append it
-    # this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
-    if file_name not in file_names:
-        file_names.append(file_name)
-        out_bed = open(file_name, 'w')
-    else :
-        out_bed = open(file_name, 'a')
-        
-    path=line[6].split(',')
-    position=path_start
+                seq_number+=char
+    return seq_number[::-1]
     
-    for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
-        # coordinates calculation : start=position, stop=position+segment_size-1, then position+=segment_size
-        
-        chr='Chr'+chromosome_id[len(chromosome_id)-2:]
-        
-        seg_start=position
-        seg_name='s'+path[i][1:]
-        seg_stop=position+segments_size[seg_name]
-
-        if path[i][0:1]=='>':
-            orientation='+'
-        elif path[i][0:1]=='<':
-            orientation='-'
-        else:
-            orientation='.'
+def get_seg_len(gfa):
+
+    # get the lines that start with "S"
+    command="grep ^S "+gfa+" > seg_coord/segments.txt"
+    subprocess.run(command,shell=True,timeout=None)
+    segments = open('seg_coord/segments.txt','r')
+    lines=segments.readlines()
+    segments.close()
+
+    # build a dictionnary with the segment sizes
+    segments_size={}
+    for line in lines:
+        line=line.split()
+        seg_id='s'+line[1]
+        seg_size=len(line[2])
+        segments_size[seg_id]=seg_size
+
+    return segments_size
+
+def check_walk_name(walk_names,name):
+    name_found=False
+    for walk in walk_names:
+        if walk in name:
+            name_found=True
+            break
+    return name_found # rename name_found var
+
+def seg_coord(gfa,walk_names):
+
+    # create directory to store output files
+    command="mkdir seg_coord"
+    subprocess.run(command,shell=True)
+
+    segments_size=get_seg_len(gfa)
+
+    # get the lines that start with "W"
+    command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > seg_coord/walks.txt"
+    subprocess.run(command,shell=True,timeout=None)
+    walks=open('seg_coord/walks.txt','r')
+    lines=walks.readlines()
+    walks.close()
+
+    # on these lines, get the name of the genome to name the output bed file
+    file_names=list()
+    for line in lines :
+        line=line.split()
+        name=line[3]
+
+        if check_walk_name(walk_names,name) | (len(walk_names)==1): # len=1 if there is only the source genome.
+
+            path_start=int(line[4])
+            seq_name=name.split('_')[-1]
+            if (seq_name[0:10]=="chromosome") | (seq_name[0:10]=="Chromosome"):
+                seq_name="Chr"+seq_name[10:]
+
+            file_name="seg_coord/"+name+'.bed'
+
+            # if we are writing in the file for the first time, overwrite it. else, append it
+            # this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
+            if file_name not in file_names:
+                file_names.append(file_name)
+                out_bed = open(file_name, 'w')
+            else :
+                out_bed = open(file_name, 'a')
+                
+            path=line[6].split(',')
+            position=path_start
+            
+            for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
+                # coordinates calculation : start=position, stop=position+segment_size-1, then position+=segment_size
+                
+                seg_start=position
+                seg_name='s'+path[i][1:]
+                seg_stop=position+segments_size[seg_name]
+            
+                out_line=seq_name+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
+                out_bed.write(out_line)
+                
+                position+=segments_size[seg_name]
+            out_bed.close()
     
-        out_line=chr+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
-        out_bed.write(out_line)
-        
-        position+=segments_size[seg_name]
-    out_bed.close()
- 
-command="rm segments.txt && rm walks.txt"
-subprocess.run(command,shell=True)
-
-
-command="if [ 0 -lt $(ls _MINIGRAPH_* 2>/dev/null | wc -w) ]; then mkdir minigraph_segments && mv _MINIGRAPH_* minigraph_segments; fi"
-subprocess.run(command,shell=True)
\ No newline at end of file
+    command="rm seg_coord/segments.txt && rm seg_coord/walks.txt"
+    subprocess.run(command,shell=True,timeout=None)
+
+    command="if ls test/*_MINIGRAPH_* 1> /dev/null 2>&1; then mkdir seg_coord/minigraph_segments && mv *_MINIGRAPH_* seg_coord/minigraph_segments/; fi"
+    subprocess.run(command,shell=True,timeout=None)
\ No newline at end of file
-- 
GitLab