From 060713b0476874312abae3b6e0f659f7d5fc912a Mon Sep 17 00:00:00 2001
From: tdurand <theo.durand@etu.uca.fr>
Date: Wed, 29 Jun 2022 10:08:06 +0200
Subject: [PATCH] Add parse_hmmer rule and HMMER output parsing script

---
 cluster_config_SLURM.yaml | 32 ++++++++++++++++++++++++
 scripts/hmmer_parse.py    | 52 +++++++++++++++++++++++++++++++++++++++
 snakefile                 | 29 +++++++++++++++++++++-
 3 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 scripts/hmmer_parse.py

diff --git a/cluster_config_SLURM.yaml b/cluster_config_SLURM.yaml
index 1f045d5..00088ab 100644
--- a/cluster_config_SLURM.yaml
+++ b/cluster_config_SLURM.yaml
@@ -93,3 +93,35 @@ effectorP:
 count_effector:
     cpus-per-task: 1
     partition: fast
+    
+dbcan2:
+    cpus-per-task: 2
+    partition: fast
+    
+cazyme_count:
+    cpus-per-task: 1
+    partition: fast
+
+orthofinder:
+    cpus-per-task: 10
+    partition: fast
+    
+orthofinder_parse:
+    cpus-per-task: 1
+    partition: fast
+    
+interproscan:
+    cpus-per-task: 4
+    partition: fast
+
+makeblast_db_phibase:
+    cpus-per-task: 1
+    partition: fast
+
+phibase:
+    cpus-per-task: 1
+    partition: fast
+
+parse_hmmer:
+    cpus-per-task: 1
+    partition: fast
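
Note: these new entries follow the existing pattern of cluster_config_SLURM.yaml (one block per rule with cpus-per-task and partition). As an illustration only, a helper such as the snakefile's get_threads() could resolve them with PyYAML; the sketch below is an assumption about how such a helper might look, not the repository's actual implementation.

import yaml  # assumes PyYAML is installed

def get_threads(rule_name, default):
    """Illustrative sketch: return the cpus-per-task declared for a rule, or a default."""
    with open("cluster_config_SLURM.yaml") as fh:
        cluster = yaml.safe_load(fh)
    return int(cluster.get(rule_name, {}).get("cpus-per-task", default))

# With the entries added above, get_threads("parse_hmmer", 1) -> 1
# and get_threads("orthofinder", 1) -> 10.
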
diff --git a/scripts/hmmer_parse.py b/scripts/hmmer_parse.py
new file mode 100644
index 0000000..2c114b7
--- /dev/null
+++ b/scripts/hmmer_parse.py
@@ -0,0 +1,52 @@
+import re
+from collections import defaultdict
+
+import click
+import pandas as pd
+from Bio import SearchIO
+
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--hmmer_file', '-f', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to the hmmer --tblout output file')
+@click.option('--parse_output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path of the parsed CSV file to create')
+def main(hmmer_file, parse_output):
+    """ This program parse the output .tbl of hmmer """
+    hits = defaultdict(list)
+    attribs = ['id','bitscore','evalue']
+    query = defaultdict(list)
+    # First pass on the tblout file: collect the Pfam query name (column 3) and accession (column 4) of every hit line
+    with open(hmmer_file) as f1:
+        for lignes in f1:
+            if re.search('^[^#]',lignes):
+                ligne = lignes.split()
+                query["query_name"].append(ligne[2])
+                query["accession"].append(ligne[3])
+
+    # Second pass with Bio.SearchIO: collect the target id, bitscore and e-value of each hit, in file order
+    with open(hmmer_file) as f2:
+        for record in SearchIO.parse(f2, 'hmmer3-tab'):
+            for hit in record.hits:
+                for attrib in attribs:
+                    hits[attrib].append(getattr(hit, attrib))
+    # Merge the two passes column-wise (rows are in the same file order),
+    # then keep only the best (lowest e-value) domain per protein id
+    table = pd.DataFrame.from_dict(hits)
+    table2 = pd.DataFrame.from_dict(query)
+    results = pd.concat([table,table2],axis=1)
+    print(results)
+    sort_table = results.sort_values(by='evalue')
+    sort_table = sort_table.drop_duplicates(subset=["id"],keep='first')
+    sort_table.to_csv(parse_output,index=False)
+
+
+if __name__ == '__main__':
+    main()
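
Note: hmmer_parse.py relies on the standard column layout of a "hmmsearch --tblout" file, where the target (protein) name is the first field and the Pfam query name and accession are the third and fourth. A minimal illustration of that mapping, with invented values:

# Example hit line (values are made up, layout is the hmmer3 tabular format)
line = "g1234.t1 - Pkinase PF00069.28 1.2e-30 105.3 0.1"
fields = line.split()
protein_id = fields[0]      # target name, exposed by Bio.SearchIO as hit.id
pfam_name = fields[2]       # query name      -> "query_name" column of the CSV
pfam_accession = fields[3]  # query accession -> "accession" column of the CSV
print(protein_id, pfam_name, pfam_accession)
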
diff --git a/snakefile b/snakefile
index dd3a91c..bc11ac3 100644
--- a/snakefile
+++ b/snakefile
@@ -41,7 +41,8 @@ rule finale:
         csv_orthogroups = expand(f"{output_dir}7_ORTHOFINDER/Results_orthofinder/sequences_specific/OG_specific_{{samples}}.csv",samples=PROTEIN),
         dbcan_list = expand(f"{output_dir}6_CAZYMES/dbcan_{{samples}}/overview.txt", samples = PROTEIN),
         interpro_gff_list = expand(f"{output_dir}8_INTERPROSCAN/{{samples}}/{{samples}}.fasta.gff3", samples = PROTEIN),
-        blast_phibase = expand(f"{output_dir}9_PHIBASE/{{samples}}/{{samples}}_blast_phibase.out", samples = PROTEIN)
+        blast_phibase = expand(f"{output_dir}9_PHIBASE/{{samples}}/{{samples}}_blast_phibase.out", samples = PROTEIN),
+        hmmer_parsed = expand(f"{output_dir}3_HMMER_PFAM/PARSED_FILE/{{samples}}_parsed.csv", samples = PROTEIN)
         #gff_cazymes_list = expand(f"{output_dir}GFF_with_cazymes/{{samples}}_gff.csv",samples=PROTEIN)
 
 
@@ -541,6 +542,32 @@ rule hmmer_pfam :
     shell:
         f"hmmsearch --tblout {{output.protein_secreted_domain}} {{params.param_hmmer}} {{input.bdd_pfam}} {{input.fasta_secreted}} 1>{{log.output}} 2>{{log.error}}"
 
+rule parse_hmmer :
+    threads: get_threads("parse_hmmer",1)
+    input:
+        result_hmmer = rules.hmmer_pfam.output.protein_secreted_domain
+    output:
+        hmmer_parsed = f"{output_dir}3_HMMER_PFAM/PARSED_FILE/{{samples}}_parsed.csv"
+    log:
+        error=f'{log_dir}hmmer_pfam/hmmer_pfam_parsed_{{samples}}.e',
+        output=f'{log_dir}hmmer_pfam/hmmer_pfam_parsed_{{samples}}.o'
+    message:
+        f"""
+                                                    Running {{rule}}
+                                                       Input:
+                                                           - HMMER RESULT : {{input.result_hmmer}}
+                                                       Output:
+                                                           - HMMER PARSED : {{output.hmmer_parsed}}
+                                                       Others
+                                                           - Threads : {{threads}}
+                                                           - LOG error: {{log.error}}
+                                                           - LOG output: {{log.output}}
+
+                                                   """
+    shell:
+        f"python {script_dir}hmmer_parse.py -f {{input.result_hmmer}} -o {{output.hmmer_parsed}} 1>{{log.output}} 2>{{log.error}}"
+
+
 rule effectorP :
     threads: get_threads("effectorP", 10)
     input:
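
Note: each {sample}_parsed.csv written by rule parse_hmmer holds one row per secreted protein (columns id, bitscore, evalue, query_name, accession, with the best e-value kept per id). A minimal sketch of reading it downstream; the path and sample name are examples, not files created by this patch:

import pandas as pd

parsed = pd.read_csv("3_HMMER_PFAM/PARSED_FILE/sampleA_parsed.csv")
best_domain_per_protein = parsed.set_index("id")["query_name"].to_dict()
print(len(best_domain_per_protein), "secreted proteins with a Pfam hit")
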
-- 
GitLab