From 060713b0476874312abae3b6e0f659f7d5fc912a Mon Sep 17 00:00:00 2001 From: tdurand <theo.durand@etu.uca.fr> Date: Wed, 29 Jun 2022 10:08:06 +0200 Subject: [PATCH] add rules and script parse hmmer --- cluster_config_SLURM.yaml | 32 ++++++++++++++++++++++++ scripts/hmmer_parse.py | 52 +++++++++++++++++++++++++++++++++++++++ snakefile | 29 +++++++++++++++++++++- 3 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 scripts/hmmer_parse.py diff --git a/cluster_config_SLURM.yaml b/cluster_config_SLURM.yaml index 1f045d5..00088ab 100644 --- a/cluster_config_SLURM.yaml +++ b/cluster_config_SLURM.yaml @@ -93,3 +93,35 @@ effectorP: count_effector: cpus-per-task: 1 partition: fast + +dbcan2: + cpus-per-task: 2 + partition: fast + +cazyme_count: + cpus-per-task: 1 + partition: fast + +orthofinder: + cpus-per-task: 10 + partition: fast + +orthofinder_parse: + cpus-per-task: 1 + partition: fast + +interproscan: + cpus-per-task: 4 + partition: fast + +makeblast_db_phibase: + cpus-per-task: 1 + partition: fast + +phibase: + cpus-per-task: 1 + partition: fast + +parse_hmmer: + cpus-per-task: 1 + partition: fast diff --git a/scripts/hmmer_parse.py b/scripts/hmmer_parse.py new file mode 100644 index 0000000..2c114b7 --- /dev/null +++ b/scripts/hmmer_parse.py @@ -0,0 +1,52 @@ +import re +import sys +from collections import defaultdict +from Bio import SeqIO +import pandas as pd +import click +from Bio import SearchIO + + +@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800}) +@click.option('--hmmer_file', '-f', default=None, + type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True), + required=True, show_default=True, help='Path to output of hmmer file') +@click.option('--parse_output', '-o', default=None, + type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True), + required=True, show_default=True, help='Path to create output of hmmer results parsed') +def main(hmmer_file, parse_output): + """ This program parse the output .tbl of hmmer """ + hits = defaultdict(list) + test = set() + attribs = ['id','bitscore','evalue'] + query = defaultdict(list) + with open(hmmer_file) as f1: + for lignes in f1: + if re.search('^[^#]',lignes): + ligne = lignes.split() + query["query_name"].append(ligne[2]) + query["accession"].append(ligne[3]) + test.add(ligne[0]) + + with open(hmmer_file) as f2: + for record in SearchIO.parse(f2, 'hmmer3-tab'): + for hit in record.hits: + for attrib in attribs: + hits[attrib].append(getattr(hit, attrib)) + table = pd.DataFrame.from_dict(hits) + table2 = pd.DataFrame.from_dict(query) + frames = [table,table2] + results = pd.concat(frames,axis=1) + print(results) + sort_table = (results.sort_values(by='evalue')) + sort_table = sort_table.drop_duplicates(subset=["id"],keep='first') + #print(sort_table) + #print(len(test)) + sort_table.to_csv(parse_output,index=False) + + + + + +if __name__ == '__main__': + main() diff --git a/snakefile b/snakefile index dd3a91c..bc11ac3 100644 --- a/snakefile +++ b/snakefile @@ -41,7 +41,8 @@ rule finale: csv_orthogroups = expand(f"{output_dir}7_ORTHOFINDER/Results_orthofinder/sequences_specific/OG_specific_{{samples}}.csv",samples=PROTEIN), dbcan_list = expand(f"{output_dir}6_CAZYMES/dbcan_{{samples}}/overview.txt", samples = PROTEIN), interpro_gff_list = expand(f"{output_dir}8_INTERPROSCAN/{{samples}}/{{samples}}.fasta.gff3", samples = PROTEIN), - blast_phibase = expand(f"{output_dir}9_PHIBASE/{{samples}}/{{samples}}_blast_phibase.out", samples = PROTEIN) + blast_phibase = expand(f"{output_dir}9_PHIBASE/{{samples}}/{{samples}}_blast_phibase.out", samples = PROTEIN), + hmmer_parsed= expand(f"{output_dir}3_HMMER_PFAM/PARSED_FILE/{{samples}}_parsed.csv",samples=PROTEIN) #gff_cazymes_list = expand(f"{output_dir}GFF_with_cazymes/{{samples}}_gff.csv",samples=PROTEIN) @@ -541,6 +542,32 @@ rule hmmer_pfam : shell: f"hmmsearch --tblout {{output.protein_secreted_domain}} {{params.param_hmmer}} {{input.bdd_pfam}} {{input.fasta_secreted}} 1>{{log.output}} 2>{{log.error}}" +rule parse_hmmer : + threads: get_threads("parse_hmmer",1) + input: + result_hmmer = rules.hmmer_pfam.output.protein_secreted_domain + output: + hmmer_parsed = f"{output_dir}3_HMMER_PFAM/PARSED_FILE/{{samples}}_parsed.csv" + log: + error=f'{log_dir}hmmer_pfam/hmmer_pfam_parsed_{{samples}}.e', + output=f'{log_dir}hmmer_pfam/hmmer_pfam_parsed_{{samples}}.o' + message: + f""" + Running {{rule}} + Input: + - HMMER RESULT : {{input.result_hmmer}} + Output: + - HMMER PARSED : {{output.hmmer_parsed}} + Others + - Threads : {{threads}} + - LOG error: {{log.error}} + - LOG output: {{log.output}} + + """ + shell: + f"python {script_dir}hmmer_parse.py -f {{input.result_hmmer}} -o {{output.hmmer_parsed}} 1>{{log.output}} 2>{{log.error}}" + + rule effectorP : threads: get_threads("effectorP", 10) input: -- GitLab