From ca0706b9b6aa1b8642d3a2537f8f6044a6f106f7 Mon Sep 17 00:00:00 2001
From: SimonBache <simon.bache@etu.unistra.fr>
Date: Tue, 28 Jun 2022 11:39:35 +0200
Subject: [PATCH] add dbcan parse script

---
 scripts/dbcan.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 scripts/dbcan.py

diff --git a/scripts/dbcan.py b/scripts/dbcan.py
new file mode 100644
index 0000000..10c9c1f
--- /dev/null
+++ b/scripts/dbcan.py
@@ -0,0 +1,39 @@
+import pandas as pd
+import click
+
+
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--dbcan', '-i', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to input dbcan result file')
+@click.option('--gff', '-g', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to input gff file')
+@click.option('--output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, writable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to output count csv file')
+def main(dbcan, gff, output):
+    """Count, per contig, the genes that all three dbCAN tools annotated.
+
+    DBCAN is the five-column, tab-separated dbCAN overview table (with a header row); GFF is
+    a headerless nine-column GFF whose ``gene`` rows are matched to it by identifier. The
+    contig -> gene count table is written to OUTPUT as CSV.
+    """
+    dbcan = pd.read_table(dbcan)
+    dbcan.columns = ["gene", "HMMER", "Hotpep", "Diamond", "#ofTools"]
+    dbcan = dbcan[dbcan["#ofTools"] == 3]
+
+    # GFF has no header row and may start with '#' directive lines; without header=None the
+    # first record is consumed as column names (or the nine-name assignment fails outright).
+    gff_columns = ["contig", "source", "feature", "start", "stop", "score_1", "strand", "score_2", "id"]
+    gff = pd.read_table(gff, comment="#", header=None, names=gff_columns)
+    gff = gff[gff["feature"] == "gene"]
+    # Extract the identifier value directly: the former str.replace chain depended on
+    # regex=True (no longer the pandas default) and mangled ids containing '.' or extra keys.
+    gene_ids = gff["id"].str.extract(r'(?:^|;)(?:ID|Name|Parent)=([^;]+)', expand=False)
+    # Count matching gene rows per contig directly; an index-aligned concat adds nothing here.
+    gff.loc[gene_ids.isin(dbcan["gene"]), "contig"].value_counts().to_csv(str(output))
+
+if __name__ == '__main__':  # run the CLI only when executed as a script, not on import
+    main()  # called without arguments; the click decorators on main declare its options
\ No newline at end of file
-- 
GitLab