From f2324623607dc59a093fabe819ff66588c0e06ef Mon Sep 17 00:00:00 2001
From: NMarthe <nina.marthe@ird.fr>
Date: Tue, 31 Oct 2023 12:52:31 +0100
Subject: [PATCH] =?UTF-8?q?corrig=C3=A9=20la=20detection=20des=20chromosom?=
 =?UTF-8?q?es?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 seg_coord/getSegmentsCoordinates.py | 31 ++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/seg_coord/getSegmentsCoordinates.py b/seg_coord/getSegmentsCoordinates.py
index 27c5704..3eccffa 100644
--- a/seg_coord/getSegmentsCoordinates.py
+++ b/seg_coord/getSegmentsCoordinates.py
@@ -1,6 +1,27 @@
 import subprocess
 import sys
 
+def has_numbers(inputString): # tell whether the string holds at least one digit
+    return any(map(str.isdigit, inputString))
+
+def getChrName(chromosome_field):
+    """Build a 'Chr<id>' label from the suffix of a chromosome field.
+
+    The id is the run of digits ending the field; when the field does not
+    end with a digit, the run of uppercase letters ending it is used.
+    The digit test is made on the suffix, not on the whole field, so that
+    a field such as 'IRGSP1_chrX' gives 'ChrX' instead of a bare 'Chr'.
+    """
+    for is_id_char in (str.isdigit, str.isupper):
+        suffix_len=0
+        for char in reversed(chromosome_field): # walk back from the end
+            if not is_id_char(char):
+                break
+            suffix_len+=1
+        if suffix_len:
+            return "Chr"+chromosome_field[-suffix_len:]
+    return "Chr" # neither digits nor uppercase letters end the field
+
 if not(len(sys.argv)==2) :
         print("expected input : gfa file with walks.")
         print("output : bed files giving the coordinates of the segments on the genomes (or on minigraph segments).")
@@ -38,12 +59,12 @@ walks.close()
 file_names=list()
 for line in lines :
     line=line.split()
-    name=line[1]
-    chr_field=line[3].split('_')
-    chromosome=chr_field[len(chr_field)-1]
+    name=line[3]
     path_start=int(line[4])
+    chromosome_field=line[3]
+    chromosome_id=getChrName(chromosome_field)
 
-    file_name=name+'_'+chromosome+'.bed'
+    file_name=name+'.bed'
 
     # if we are writing in the file for the first time, overwrite it. else, append it
     # this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
@@ -59,7 +80,7 @@ for line in lines :
     for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
         # coordinates calculation : start=position, stop=position+segment_size-1, then position+=segment_size
         
-        chr='Chr'+chromosome[len(chromosome)-2:]
+        chr='Chr'+chromosome_id[len(chromosome_id)-2:]
         
         seg_start=position
         seg_name='s'+path[i][1:]
-- 
GitLab