From e2f72e91ec4a3c86bbcf6372d1a1bb1d11203b0a Mon Sep 17 00:00:00 2001
From: SimonBache <simon.bache@etu.unistra.fr>
Date: Fri, 24 Jun 2022 11:33:36 +0200
Subject: [PATCH] add rules cazymes orthofinder interproscan phibase

---
 snakefile | 239 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 236 insertions(+), 3 deletions(-)

diff --git a/snakefile b/snakefile
index 8383686..bf15de5 100644
--- a/snakefile
+++ b/snakefile
@@ -5,6 +5,7 @@ output_dir = config["DATA"]["OUTPUT"]
 log_dir = f"{output_dir}LOGS/"
 script_dir = config["DATA"]["SCRIPTS"]
 gff_dir = config["DATA"]["GFF"]
+dbcan_db = config["DATA"]["DBCAN_DB"]
 
 PROTEIN, = glob_wildcards(fasta_prot_dir+"{samples}.fasta", followlinks=True)
 
@@ -32,8 +33,15 @@ def get_threads(rule, default):
 
 rule finale:
     input:
-        domain_prot = expand(f"{output_dir}3_HMMER_PFAM/{{samples}}_secreted.tbl", samples = PROTEIN),
-        effector_contig = expand(f"{output_dir}5_FINAL_RESULT/EFFECTOR/{{samples}}/{{samples}}_effector_per_contig.txt", samples = PROTEIN)
+        domain_prot=expand(f"{output_dir}3_HMMER_PFAM/{{samples}}_secreted.tbl",samples=PROTEIN),
+        effector_contig=expand(f"{output_dir}5_FINAL_RESULT/EFFECTOR/{{samples}}/{{samples}}_effector_per_contig.txt",samples=PROTEIN),
+        cazyme_counts_list=expand(f"{output_dir}6_CAZYMES/dbcan_{{samples}}/{{samples}}_cazyme_count.csv",samples=PROTEIN),
+        orthogroups_sequences=expand(f"{output_dir}7_ORTHOFINDER/Results_orthofinder/sequences_specific/prot_specific_{{samples}}.fasta",samples=PROTEIN),
+        csv_orthogroups=expand(f"{output_dir}7_ORTHOFINDER/Results_orthofinder/sequences_specific/OG_specific_{{samples}}.csv",samples=PROTEIN),
+        dbcan_list=expand(f"{output_dir}6_CAZYMES/dbcan_{{samples}}/overview.txt",samples=PROTEIN),
+        interpro_gff_list=expand(f"{output_dir}8_INTERPROSCAN/{{samples}}/{{samples}}.fasta.gff3",samples=PROTEIN),
+        blast_phibase=expand(f"{output_dir}9_PHIBASE/{{samples}}/{{samples}}_blast_phibase.out",samples=PROTEIN)
+        #gff_cazymes_list = expand(f"{output_dir}GFF_with_cazymes/{{samples}}_gff.csv",samples=PROTEIN)
 
 
 rule rename_protein:
@@ -617,4 +625,229 @@ rule count_effector:
         """
         python {script_dir}count_effectors.py -g {input.gff_protein} -o {output.effector_per_contig} -fasta {input.fasta_effectors} 1>{log.output} 2>{log.error}
         sort -V {output.effector_per_contig} -o {output.effector_per_contig}
-        """
\ No newline at end of file
+        """
+
+rule dbcan2:
+    threads: get_threads("dbcan2", 2)
+    input:
+        fasta_proteins = rules.rename_protein.output.sorted_protein
+    params:
+        dbcan_dir = f"{output_dir}6_CAZYMES/dbcan_{{samples}}/",
+        database_dir = dbcan_db
+    output:
+        dbcan_result = f"{output_dir}6_CAZYMES/dbcan_{{samples}}/overview.txt"
+    log :
+        error = f'{log_dir}dbcan/{{samples}}.e',
+        output = f'{log_dir}dbcan/{{samples}}.o'
+    message:
+        f"""
+                 Running {{rule}}
+                    Input:
+                        - prot : {{input.fasta_proteins}}
+                    Output:
+                        - result: {{output.dbcan_result}}
+                    Others
+                        - Threads : {{threads}}
+                        - LOG error: {{log.error}}
+                        - LOG output: {{log.output}}
+                """
+    envmodules:
+        "run-dbcan/2.0.11"
+    shell:
+        "run_dbcan.py {input.fasta_proteins} protein --out_dir {params.dbcan_dir} --db_dir {params.database_dir} 1>{log.output} 2>{log.error}"
+
+
+rule cazyme_count:
+    threads: get_threads("cazyme_count", 1)
+    input:
+        dbcan_file = rules.dbcan2.output.dbcan_result,
+        gff_renamed = rules.sort_gff.output.gff_sorted
+    output:
+        cazyme_count = f"{output_dir}6_CAZYMES/dbcan_{{samples}}/{{samples}}_cazyme_count.csv"
+    log:
+        error=f'{log_dir}caz_count/{{samples}}.e',
+        output=f'{log_dir}caz_count/{{samples}}.o'
+    message:
+        f"""
+                     Running {{rule}}
+                        Input:
+                            - dbcan_file : {{input.dbcan_file}}
+                        Output:
+                            - cazyme_count: {{output.cazyme_count}}
+                        Others
+                            - Threads : {{threads}}
+                            - LOG error: {{log.error}}
+                            - LOG output: {{log.output}}
+                    """
+    shell: f"python {script_dir}dbcan.py -i {{input.dbcan_file}} -g {{input.gff_renamed}} -o {{output.cazyme_count}} 1>{{log.output}} 2>{{log.error}}"
+
+'''
+rule add_caz_to_gff:
+    threads: get_threads("add_caz_to_gff", 1)
+    input:
+        dbcan_file = rules.dbcan2.output.dbcan_result,
+        gff_renamed = rules.rename_id_gff.output.gff_renamed
+    output:
+        gff_caz = f"{output_dir}GFF_with_cazymes/{{samples}}_gff.csv"
+    log:
+        error=f'{log_dir}caz_gff/{{samples}}.e',
+        output=f'{log_dir}caz_gff/{{samples}}.o'
+    message:
+        f"""
+                         Running {{rule}}
+                            Input:
+                                - dbcan_file : {{input.dbcan_file}}
+                            Output:
+                                - gff: {{output.gff_caz}}
+                            Others
+                                - Threads : {{threads}}
+                                - LOG error: {{log.error}}
+                                - LOG output: {{log.output}}
+                        """
+    shell: "python cazymes_add_gff.py -i {input.dbcan_file} -g {input.gff_renamed} -o {output.gff_caz}"
+'''
+
+rule orthofinder:
+    threads: get_threads("orthofinder", 10)
+    input:
+        prot_files = expand(f"{output_dir}1_PROTEIN_SORTED/{{samples}}.fasta", samples = PROTEIN)
+    params:
+        dir_prot = f"{output_dir}1_PROTEIN_SORTED/",
+        dir_out = f"{output_dir}7_ORTHOFINDER/",
+        name_dir = "orthofinder"
+    output:
+        orthogroups_file = f"{output_dir}7_ORTHOFINDER/Results_orthofinder/Orthogroups/Orthogroups.txt",
+        orthogroups_table= f"{output_dir}7_ORTHOFINDER/Results_orthofinder/Orthogroups/Orthogroups.GeneCount.tsv"
+    log:
+        error=f'{log_dir}orthofinder/orthofinder.e',
+        output=f'{log_dir}orthofinder/orthofinder.o'
+    message:
+        f"""
+                         Running {{rule}}
+                            Input:
+                                - prot_files : {{input.prot_files}}
+                            Output:
+                                - orthogroups: {{output.orthogroups_file}}
+                            Others
+                                - Threads : {{threads}}
+                                - LOG error: {{log.error}}
+                                - LOG output: {{log.output}}
+                        """
+    envmodules:
+        "orthofinder/2.5.2"
+    shell:
+        """
+        rm -rf {output_dir}7_ORTHOFINDER/
+        orthofinder -o {params.dir_out} -n {params.name_dir} -f {params.dir_prot} -t 10 -M msa -a 10 1>{log.output} 2>{log.error}
+        """
+
+rule orthofinder_parse:
+    threads: get_threads("orthofinder_parse", 1)
+    input:
+        orthogroups_table= f"{output_dir}7_ORTHOFINDER/Results_orthofinder/Orthogroups/Orthogroups.GeneCount.tsv"
+    params:
+        strain_name = f"{{samples}}",
+        path_seq = f"{output_dir}7_ORTHOFINDER/Results_orthofinder/Orthogroup_Sequences/"
+    output:
+        fasta_og = f"{output_dir}7_ORTHOFINDER/Results_orthofinder/sequences_specific/prot_specific_{{samples}}.fasta",
+        csv_orthogroups_specific = f"{output_dir}7_ORTHOFINDER/Results_orthofinder/sequences_specific/OG_specific_{{samples}}.csv"
+    log:
+        error=f'{log_dir}orthofinder_parse/{{samples}}.e',
+        output=f'{log_dir}orthofinder_parse/{{samples}}.o'
+    message:
+        f"""
+                             Running {{rule}}
+                                Input:
+                                    - orthogroups_table : {{input.orthogroups_table}}
+                                Output:
+                                    - fasta: {{output.fasta_og}}
+                                Others
+                                    - Threads : {{threads}}
+                                    - LOG error: {{log.error}}
+                                    - LOG output: {{log.output}}
+                            """
+    shell:
+        f"python {script_dir}orthofinder_parse.py -t {{input.orthogroups_table}} -n {{params.strain_name}} -p {{params.path_seq}} -f {{output.fasta_og}} -c {{output.csv_orthogroups_specific}} 1>{{log.output}} 2>{{log.error}}"
+
+rule interproscan:
+    threads: get_threads("interproscan", 4)
+    input:
+        fasta_proteins = rules.rename_protein.output.sorted_protein
+    params:
+        directory_output = f"{output_dir}8_INTERPROSCAN/{{samples}}/",
+        interpro_params = config["TOOLS_PARAMS"]["INTERPROSCAN"]
+    output:
+        gff = f"{output_dir}8_INTERPROSCAN/{{samples}}/{{samples}}.fasta.gff3"
+    log:
+        error=f'{log_dir}interproscan/{{samples}}.e',
+        output=f'{log_dir}interproscan/{{samples}}.o'
+    message:
+        f"""
+                                 Running {{rule}}
+                                    Input:
+                                        - fasta_proteins : {{input.fasta_proteins}}
+                                    Output:
+                                        - gff: {{output.gff}}
+                                    Others
+                                        - Threads : {{threads}}
+                                        - LOG error: {{log.error}}
+                                        - LOG output: {{log.output}}
+                                """
+    envmodules:
+        "interproscan/5.54-87.0"
+    shell:
+        "interproscan.sh -i {input.fasta_proteins} -d {params.directory_output} {params.interpro_params} 1>{log.output} 2>{log.error}"
+
+rule makeblast_db_phibase:
+    threads: get_threads("makeblast_db_phibase", 1)
+    input:
+        phibase_database = f"{phibase_db}phi-base_current.fasta"
+    output:
+        complete_database = f"{phibase_db}phi-base_current.fasta.phr"
+    log:
+        error=f'{log_dir}phi_base_db.e',
+        output=f'{log_dir}phi_base_db.o'
+    message:
+        f"""
+                                     Running {{rule}}
+                                        Input:
+                                            - database : {{input.phibase_database}}
+                                        Output:
+                                            - complete_database: {{output.complete_database}}
+                                        Others
+                                            - Threads : {{threads}}
+                                            - LOG error: {{log.error}}
+                                            - LOG output: {{log.output}}
+                                    """
+    envmodules:
+        "blast"
+    shell:
+        "makeblastdb -in {input.phibase_database} -dbtype prot 1>{log.output} 2>{log.error}"
+
+rule phibase:
+    threads: get_threads("phibase", 4)
+    input:
+        fasta_effectors = rules.effectorP.output.fasta_effectors,
+        phibase_database = f"{phibase_db}phi-base_current.fasta",
+        verif = rules.makeblast_db_phibase.output.complete_database
+    output:
+        blast_result = f"{output_dir}9_PHIBASE/{{samples}}/{{samples}}_blast_phibase.out"
+    log:
+        error=f'{log_dir}phi_base/{{samples}}.e',
+        output=f'{log_dir}phi_base/{{samples}}.o'
+    message:
+        f"""
+                                         Running {{rule}}
+                                            Input:
+                                                - database : {{input.phibase_database}}
+                                            Output:
+                                                - blast_results: {{output.blast_result}}
+                                            Others
+                                                - Threads : {{threads}}
+                                                - LOG error: {{log.error}}
+                                                - LOG output: {{log.output}}
+                                        """
+    envmodules:
+        "blast"
+    shell:
+        "blastp -db {input.phibase_database} -query {input.fasta_effectors} -outfmt 6 -out {output.blast_result}"
\ No newline at end of file
-- 
GitLab