diff --git a/config/config.yaml b/config/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8cdc175f0f96bff6dc2e415662a4c791a7e55f0 --- /dev/null +++ b/config/config.yaml @@ -0,0 +1,13 @@ +DATA: + PROTEIN: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/fasta_prot/" + OUTPUT: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/results/" + SCRIPTS: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/scripts/" + BDD_PFAM: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/BDD_PFAM/Pfam-A.hmm" + GFF: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/GFF3_protein/" + +TOOLS_PARAMS: + PARSE_TARGETP_TRESHOLD: "0.8" + PARSE_SIGNALP_TRESHOLD: "0.8" + PARSE_TMHMM_TRESHOLD: "1" + PARSE_WOLFPOSORT_TRESHOLD: "14" + HMMER: "--cpu 8 -E 0.003" \ No newline at end of file diff --git a/run_worflow.sh b/run_worflow.sh new file mode 100644 index 0000000000000000000000000000000000000000..2a20fd8b971822a1fe318383aebb15b9d53e24c8 --- /dev/null +++ b/run_worflow.sh @@ -0,0 +1,18 @@ +#!/bin/sh +### Job name +#SBATCH --job-name=annotation + +### Requirements +#SBATCH --partition=long + + + +### Output +#SBATCH --output=/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/log_effector.out +#SBATCH --error=/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/log_effector.err +module purge +module load r +module load python/3.7 + + +snakemake --profile effector --use-envmodules diff --git a/scripts/count_effectors.py b/scripts/count_effectors.py new file mode 100644 index 0000000000000000000000000000000000000000..1afcf32ef960cb09cf46ac42d7a75bf7acd0b724 --- /dev/null +++ b/scripts/count_effectors.py @@ -0,0 +1,67 @@ +import re +import sys +from collections import defaultdict +from Bio import SeqIO +import pandas as pd +from collections import Counter +import click + +@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800}) 
@click.option('--gff', '-g', default=None,
        type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to input gff file')
@click.option('--output', '-o', default=None,
        type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to output file')
@click.option('--fasta_file', '-fasta', default=None,
        type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to fasta file of effector')
def main(gff, output, fasta_file):
    """This count the number of effector per contig"""
    # gff        : GFF3 file; only rows whose type column is exactly "gene" are used.
    # output     : headerless TSV written as "<contig>\t<number of effector genes>".
    # fasta_file : FASTA of effector proteins; its record IDs are the effector IDs.

    # A set gives O(1) membership tests; a duplicated FASTA ID is counted once.
    effector_ids = {record.id for record in SeqIO.parse(fasta_file, "fasta")}

    # contig -> gene IDs, in order of first appearance in the GFF.
    genes_by_contig = defaultdict(list)
    with open(gff, "r") as gff_handle:
        for raw_line in gff_handle:
            # Directives/comments ("##gff-version 3") and malformed rows have
            # fewer than nine columns and must not be indexed.
            if raw_line.startswith("#"):
                continue
            columns = raw_line.rstrip("\n").split("\t")
            if len(columns) < 9 or columns[2] != "gene":
                continue
            # "ID=<gene>;Name=<gene>" -> "<gene>"
            gene_id = re.sub(r";Name=\w+", "", re.sub("ID=", "", columns[8]))
            genes_by_contig[columns[0]].append(gene_id)

    # One list entry per effector gene, labelled with the contig that carries it.
    effector_contigs = [
        contig
        for contig, gene_ids in genes_by_contig.items()
        for gene_id in gene_ids
        if gene_id in effector_ids
    ]

    counts = pd.Series(effector_contigs).value_counts()
    counts.to_csv(output, header=False, sep="\t")


if __name__ == '__main__':
    main()
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--protein_file', '-p', default=None,
        type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to fasta protein')
@click.option('--secreted_id', '-s', default=None,
        type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to ID with signal peptide')
@click.option('--fasta_output', '-o', default=None,
        type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to fasta protein output')
def main(protein_file, secreted_id, fasta_output):
    """This programme use ID of protein with signal peptide and protein fasta file to generate fasta protein with this ID"""
    # protein_file : FASTA holding every candidate protein.
    # secreted_id  : text file with one protein ID per line.
    # fasta_output : FASTA written with the records whose ID is listed.

    # Strip surrounding whitespace and skip empty lines so padded or CRLF ID
    # lists still match; a set removes repeated IDs and gives O(1) lookups.
    with open(secreted_id, "r") as id_handle:
        wanted_ids = {line.strip() for line in id_handle if line.strip()}

    # Keyed by record ID: if the FASTA repeats an ID the last record wins.
    records_by_id = {}
    for record in SeqIO.parse(protein_file, "fasta"):
        records_by_id[record.id] = record

    # Records are emitted in FASTA order.
    selected = [record for record_id, record in records_by_id.items() if record_id in wanted_ids]

    SeqIO.write(selected, fasta_output, "fasta")


if __name__ == '__main__':
    main()
@click.option('--output', '-o', default=None,
        type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to output file')
@click.option('--strain_name', '-name', default=None,
        type=click.STRING,
        required=True, show_default=True, help='Name of the strain')
def main(gff, output, strain_name):
    """This programme rename ID of the gff3 file"""
    # gff         : input GFF3 file.
    # output      : GFF3 written with the renamed gene rows only.
    # strain_name : prefix inserted after every "ID=" as "<strain_name>_".
    renamed_genes = []
    with open(gff, "r") as gff_handle:
        for raw_line in gff_handle:
            # Directives/comments have no tab-separated columns to index.
            if raw_line.startswith("#"):
                continue
            columns = raw_line.split("\t")
            # Exact comparison: a substring search would also keep feature
            # types such as "pseudogene".
            if len(columns) < 9 or columns[2] != "gene":
                continue
            attributes = columns[8].replace("ID=", "ID=" + strain_name + "_")
            # Only the first ";" ends the ID value, so only that one receives
            # the "T0" suffix; later attributes are left untouched.
            attributes = attributes.replace(";", "T0;", 1)
            # The attribute column still carries the line's trailing newline.
            renamed_genes.append("\t".join(columns[:8] + [attributes]))

    with open(output, "w") as output_file:
        output_file.writelines(renamed_genes)


if __name__ == '__main__':
    main()
@click.option('--fasta_output', '-o', default=None,
        type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to fasta protein secreted output')
def main(protein_file, id_secreted_protein, fasta_output):
    """This programme use ID of secreted protein with signal peptide and protein fasta file to generate fasta protein with this ID"""
    # protein_file        : FASTA holding every candidate protein.
    # id_secreted_protein : text file with one protein ID per line.
    # fasta_output        : FASTA written with the records whose ID is listed.

    # Strip surrounding whitespace and skip empty lines; a set removes
    # repeated IDs and gives O(1) lookups instead of a nested scan.
    with open(id_secreted_protein, "r") as id_handle:
        wanted_ids = {line.strip() for line in id_handle if line.strip()}

    # Keyed by record ID: if the FASTA repeats an ID the last record wins.
    records_by_id = {}
    for record in SeqIO.parse(protein_file, "fasta"):
        records_by_id[record.id] = record

    # Records are emitted in FASTA order.
    selected = [record for record_id, record in records_by_id.items() if record_id in wanted_ids]

    SeqIO.write(selected, fasta_output, "fasta")


if __name__ == '__main__':
    main()
@click.option('--spphobius_output', '-o', default=None,
        type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to create output of secreted ID of sorted protein')
def main(phobius_file, spphobius_output):
    """ This program retrieve ID of secreted protein of phobius with TM == 0 or TM == 1 """
    # phobius_file     : whitespace-separated table; column 1 is the ID,
    #                    column 2 the TM count, column 3 the SP flag.
    # spphobius_output : text file written with one retained ID per line.
    secreted_ids = []
    with open(phobius_file) as phobius_handle:
        for raw_line in phobius_handle:
            fields = raw_line.split()
            # Blank or truncated rows cannot be classified.
            if len(fields) < 3:
                continue
            # No header filter is needed: a header row cannot have "0"/"1" in
            # column 2 and "Y" in column 3. The previous "^[^SEQUENCE]" filter
            # was a character class and discarded every ID starting with
            # S, E, Q, U, N or C.
            if fields[1] in ("0", "1") and fields[2] == "Y":
                secreted_ids.append(fields[0])

    with open(spphobius_output, "w") as output_id:
        output_id.writelines(protein_id + "\n" for protein_id in secreted_ids)


if __name__ == '__main__':
    main()
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--signalp_file_gff', '-s', default=None,
        type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to output of gff3 of signalp')
@click.option('--spsignalp_output', '-o', default=None,
        type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to create secreted ID of sorted protein')
@click.option('--treshold', '-th', default=None,
        type=click.FloatRange(min=0, max=1, min_open=True, max_open=True),
        required=True, show_default=True, help='treshold for protein with signal peptide (between 0 & 1)')
def main(signalp_file_gff, spsignalp_output, treshold):
    """ This program retrieve ID of secreted protein of signalp with cutoff between 0 & 1 """
    # signalp_file_gff : tab-separated GFF3; column 1 is the ID, column 6 the score.
    # spsignalp_output : text file written with one retained ID per line.
    # treshold         : minimum score (inclusive) for an ID to be kept.
    secreted_ids = []
    with open(signalp_file_gff) as gff_handle:
        for raw_line in gff_handle:
            # Skip "#" directives and blank lines; a blank line used to pass
            # the "^[^#]" filter and then fail on the column lookup.
            if raw_line.startswith("#") or not raw_line.strip():
                continue
            fields = raw_line.split("\t")
            if float(fields[5]) >= treshold:
                secreted_ids.append(fields[0])

    with open(spsignalp_output, "w") as output_id:
        output_id.writelines(protein_id + "\n" for protein_id in secreted_ids)


if __name__ == '__main__':
    main()
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--targetp_file', '-t', default=None,
        type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to output of targetp2 file')
@click.option('--sptargetp_output', '-o', default=None,
        type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to create secreted ID of sorted protein')
@click.option('--treshold', '-th', default=None,
        type=click.FloatRange(min=0, max=1, min_open=True, max_open=True),
        required=True, show_default=True, help='treshold for protein with signal peptide (between 0 & 1)')
def main(targetp_file, sptargetp_output, treshold):
    """ This program retrieve ID of secreted protein of targetp with cutoff between 0 & 1 """
    # targetp_file     : tab-separated summary; column 1 is the ID, column 2 the
    #                    predicted class, column 4 the score compared to the cutoff.
    # sptargetp_output : text file written with one retained ID per line.
    # treshold         : minimum score (inclusive) for an "SP" prediction to be kept.
    secreted_ids = []
    with open(targetp_file) as targetp_handle:
        for raw_line in targetp_handle:
            # Skip "#" header lines and blank lines; a blank line used to pass
            # the "^[^#]" filter and then fail on the column lookup.
            if raw_line.startswith("#") or not raw_line.strip():
                continue
            fields = raw_line.split("\t")
            if fields[1] == "SP" and float(fields[3]) >= treshold:
                secreted_ids.append(fields[0])

    with open(sptargetp_output, "w") as output_id:
        output_id.writelines(protein_id + "\n" for protein_id in secreted_ids)


if __name__ == '__main__':
    main()
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--tmhmm_file', '-in', default=None,
        type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to input of tmhmm results')
@click.option('--parsetmhmm_output', '-o', default=None,
        type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to create secreted ID of sorted protein')
@click.option('--transmembranaire', '-tm', default=None,
        type=click.INT,
        required=True, show_default=True, help='Number of maxiumum of transmembranaire domain you want to keep')
def main(tmhmm_file, parsetmhmm_output, transmembranaire):
    """ This program parse the output of tool tmhmm with a cutoff and transmembranaire domain (max) """
    # tmhmm_file        : tab-separated table whose 5th column is "<key>=<count>".
    # parsetmhmm_output : file written with the retained rows, unchanged.
    # transmembranaire  : highest count (inclusive) a row may have to be kept.
    kept_rows = []
    with open(tmhmm_file) as tmhmm_handle:
        for raw_line in tmhmm_handle:
            row = raw_line.rstrip("\n")
            # A blank line has no 5th column to parse.
            if not row.strip():
                continue
            fields = row.split("\t")
            helix_count = int(fields[4].split("=")[1])
            if helix_count <= transmembranaire:
                kept_rows.append(row)

    with open(parsetmhmm_output, "w") as output_tmhmm:
        output_tmhmm.writelines(row + "\n" for row in kept_rows)


if __name__ == '__main__':
    main()
@click.option('--wolfpsort_file', '-in', default=None,
        type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to the wolfpsort file')
@click.option('--spwolfpsort_output', '-o', default=None,
        type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
        required=True, show_default=True, help='Path to output of wolfpsort parsed')
@click.option('--treshold', '-th', default=None,
        type=click.INT,
        required=True, show_default=True, help='treshold for score wolfpsort')
def main(wolfpsort_file, spwolfpsort_output, treshold):
    """ This program retrieve ID of secreted protein of wolfpsort with score cutoff wolfpsort """
    # wolfpsort_file     : summary rows of the form "<id> <site> <score>, <site> <score>, ...".
    # spwolfpsort_output : text file written with one retained ID per line.
    # treshold           : minimum score (inclusive) for a top-ranked "extr" site.
    secreted_ids = []
    with open(wolfpsort_file) as wolfpsort_handle:
        for raw_line in wolfpsort_handle:
            if raw_line.startswith("#"):
                continue
            # Commas only separate the "<site> <score>" pairs.
            fields = raw_line.replace(",", "").split()
            # Blank or truncated rows carry no first-ranked site.
            if len(fields) < 3:
                continue
            # Scores may be fractional (e.g. "14.5"), which int() rejects.
            if fields[1] == "extr" and float(fields[2]) >= treshold:
                secreted_ids.append(fields[0])

    with open(spwolfpsort_output, "w") as output_id:
        output_id.writelines(protein_id + "\n" for protein_id in secreted_ids)


if __name__ == '__main__':
    main()
@click.option('--strain_name', '-name', default=None,
        type=click.STRING,
        required=True, show_default=True, help='Name of the strain')
def main(protein_file, fasta_output, strain_name):
    """This programme remove * character on protein seq and rename it"""
    # protein_file : input protein FASTA.
    # fasta_output : FASTA written with the renamed, cleaned records.
    # strain_name  : prefix prepended to every record ID as "<strain_name>_".
    renamed_records = []
    for record in SeqIO.parse(protein_file, "fasta"):
        record.id = str(strain_name) + "_" + str(record.id)
        # Plain substring removal; avoids the invalid "\*" escape that a
        # non-raw regex pattern needs.
        record.seq = Seq(str(record.seq).replace("*", ""))
        # Empty description/name so the FASTA header holds only the new ID.
        record.description = ""
        record.name = ""
        renamed_records.append(record)

    SeqIO.write(renamed_records, fasta_output, "fasta")


if __name__ == '__main__':
    main()
def get_threads(rule, default):
    """
    Return the thread count to use for ``rule``.

    Lookup order in ``cluster_config`` (first hit wins, value cast to int):
    the rule's 'threads', the rule's 'cpus-per-task', then '__default__'
    'cpus-per-task', then '__default__' 'threads'. When nothing matches, the
    workflow-wide ``_cores`` resource is returned if it is truthy, otherwise
    ``default``.
    """
    if cluster_config:
        lookup_order = (
            (rule, 'threads'),
            (rule, 'cpus-per-task'),
            ('__default__', 'cpus-per-task'),
            ('__default__', 'threads'),
        )
        for section, key in lookup_order:
            if section in cluster_config and key in cluster_config[section]:
                return int(cluster_config[section][key])

    core_count = workflow.global_resources["_cores"]
    return core_count if core_count else default
# Run SignalP 6 on the renamed proteins of one sample and keep the GFF3 it writes.
rule signalP:
    threads: get_threads("signalP",10)
    input:
        protein = rules.rename_protein.output.sorted_protein
    output:
        output_signalP = f"{output_dir}2_SECRETED_PROTEIN/SignalP/{{samples}}/output.gff3"
    params:
        # Directory handed to --output_dir; output.gff3 is expected inside it.
        output_dir_signalp = f"{output_dir}2_SECRETED_PROTEIN/SignalP/{{samples}}"
    log :
        # Named after this rule (they previously reused the phobius_ prefix).
        error = f'{log_dir}signalP/signalP_{{samples}}.e',
        output = f'{log_dir}signalP/signalP_{{samples}}.o'
    message:
        f"""
        Running {{rule}}
            Input:
                - Fasta : {{input.protein}}
            Output:
                - SignalP_GFF3: {{output.output_signalP}}
            Others
                - Threads : {{threads}}
                - LOG error: {{log.error}}
                - LOG output: {{log.output}}

        """
    envmodules:
        # NOTE(review): module name has a trailing "/" and no version - confirm against the cluster.
        "signalp/"
    shell:
        f"signalp6 --fastafile {{input.protein}} --output_dir {{params.output_dir_signalp}} --organism eukarya 1>{{log.output}} 2>{{log.error}}"
        # Remove the output_*.txt files left in the same directory.
        f"\nrm -rf {{params.output_dir_signalp}}/output_*.txt"
# Filter the SignalP GFF3 of one sample down to a list of protein IDs by
# calling scripts/parse_signalp.py with the configured threshold.
rule parse_signalp:
    threads: get_threads("parse_signalp",1)
    input:
        # GFF3 produced by rule signalP.
        result_signalp = rules.signalP.output.output_signalP
    output:
        secreted_signalp = f"{output_dir}2_SECRETED_PROTEIN/ID_SECRETED/{{samples}}/SIGNALP/{{samples}}_secretedID.signalp"
    params:
        # Forwarded to the script as -th.
        threshold=config["TOOLS_PARAMS"]["PARSE_SIGNALP_TRESHOLD"]
    log:
        error = f'{log_dir}parse_signalp/parse_signalp_{{samples}}.e',
        output = f'{log_dir}parse_signalp/parse_signalp_{{samples}}.o'
    message:
        f"""
        Running {{rule}}
            Input:
                - Results_signalp : {{input.result_signalp}}
            Output:
                - Parse_signalp: {{output.secreted_signalp}}
            Others
                - Threads : {{threads}}
                - LOG error: {{log.error}}
                - LOG output: {{log.output}}

        """
    shell:
        # stdout and stderr of the script go to the two log files.
        f"python {script_dir}parse_signalp.py -s {{input.result_signalp}} -o {{output.secreted_signalp}} -th {{params.threshold}} 1>{{log.output}} 2>{{log.error}}"
# Extract from the PredGPI result of one sample the IDs that
# scripts/parse_predgpi.py keeps. The script takes no threshold, so the rule
# declares no params (it previously bound an unused one to the TargetP threshold).
rule parse_predgpi:
    threads: get_threads("parse_predgpi",1)
    input:
        # Table produced by rule predgpi.
        result_predgpi = rules.predgpi.output.output_predgpi
    output:
        noanchor_predgpi = f"{output_dir}2_SECRETED_PROTEIN/ID_SECRETED/{{samples}}/PREDGPI/{{samples}}_noanchorID.predgpi"
    log:
        error = f'{log_dir}parse_predgpi/parse_predgpi_{{samples}}.e',
        output = f'{log_dir}parse_predgpi/parse_predgpi_{{samples}}.o'
    message:
        f"""
        Running {{rule}}
            Input:
                - Result_predgpi : {{input.result_predgpi}}
            Output:
                - Parse_predgpi: {{output.noanchor_predgpi}}
            Others
                - Threads : {{threads}}
                - LOG error: {{log.error}}
                - LOG output: {{log.output}}

        """
    shell:
        f"python {script_dir}parse_predgpi.py -p {{input.result_predgpi}} -o {{output.noanchor_predgpi}} 1>{{log.output}} 2>{{log.error}}"
# Build the FASTA of the proteins listed in the intersect ID file of one
# sample by calling scripts/fasta_intersect.py.
rule fasta_intersect:
    threads: get_threads("fasta_intersect",1)
    input:
        # Renamed proteins from rule rename_protein.
        fasta_protein = rules.rename_protein.output.sorted_protein,
        # ID list written by rule intersect_tools.
        intersect_id = rules.intersect_tools.output.signalpeptide_id
    output:
        intersect_fasta_prot = f"{output_dir}2_SECRETED_PROTEIN/ID_SECRETED/{{samples}}/{{samples}}_intersect.fasta"
    log:
        error=f'{log_dir}fasta_intersect/fasta_intersect_{{samples}}.e',
        output=f'{log_dir}fasta_intersect/fasta_intersect_{{samples}}.o'
    message:
        f"""
        Running {{rule}}
            Input:
                - Fasta_protein : {{input.fasta_protein}}
                - Intersect_id : {{input.intersect_id}}
            Output:
                - Fasta_intersect_protein : {{output.intersect_fasta_prot}}
            Others
                - Threads : {{threads}}
                - LOG error: {{log.error}}
                - LOG output: {{log.output}}

        """
    shell:
        # stdout and stderr of the script go to the two log files.
        f"python {script_dir}fasta_intersect.py -p {{input.fasta_protein}} -s {{input.intersect_id}} -o {{output.intersect_fasta_prot}} 1>{{log.output}} 2>{{log.error}}"
output : {{output.tmhmm_output}} + Others + - Threads : {{threads}} + - LOG error: {{log.error}} + - LOG output: {{log.output}} + + """ + envmodules: + "tmhmm/2.0c" + shell: + f"tmhmm -short {{input.fasta_intersect_prot}} 1>{{output.tmhmm_output}} 2>{{log.error}}" + +rule parse_tmhmm: + threads: get_threads("parse_tmhmm",1) + input: + tmhmm_outfile = rules.tmhmm.output.tmhmm_output + output: + tmhmm_parsed_file = f"{output_dir}2_SECRETED_PROTEIN/TMHMM/{{samples}}/{{samples}}_tmhmm_parsed.tsv" + params: + threshold=config["TOOLS_PARAMS"]["PARSE_TMHMM_TRESHOLD"] + log: + error=f'{log_dir}parse_tmhmm/parse_tmhmm_{{samples}}.e', + output=f'{log_dir}parse_tmhmm/parse_tmhmm_{{samples}}.o' + message: + f""" + Running {{rule}} + Input: + - TMHMM output : {{input.tmhmm_outfile}} + Output: + - TMHMM parsed : {{output.tmhmm_parsed_file}} + Others + - Threads : {{threads}} + - LOG error: {{log.error}} + - LOG output: {{log.output}} + + """ + shell: + f"python {script_dir}parse_tmhmm.py -in {{input.tmhmm_outfile}} -tm {{params.threshold}} -o {{output.tmhmm_parsed_file}} 1>{{log.output}} 2>{{log.error}}" + +rule tmhmm_fasta: + threads: get_threads("tmhmm_fasta",1) + input: + tmhmm_parsed = rules.parse_tmhmm.output.tmhmm_parsed_file, + protein_intersected = rules.fasta_intersect.output.intersect_fasta_prot + output: + fasta_parsed = f"{output_dir}2_SECRETED_PROTEIN/TMHMM/{{samples}}/{{samples}}_tmhmm_parsed.fasta" + log: + error=f'{log_dir}tmhmm_fasta/tmhmm_fasta_{{samples}}.e', + output=f'{log_dir}tmhmm_fasta/tmhmm_fasta{{samples}}.o' + message: + f""" + Running {{rule}} + Input: + - TMHMM parsed : {{input.tmhmm_parsed}} + - Fasta intersect : {{input.protein_intersected}} + Output: + - Fasta protein : {{output.fasta_parsed}} + Others + - Threads : {{threads}} + - LOG error: {{log.error}} + - LOG output: {{log.output}} + + """ + shell: + f"python {script_dir}tmhmm_to_fasta.py -p {{input.protein_intersected}} -tmhmm {{input.tmhmm_parsed}} -o {{output.fasta_parsed}} 
1>{{log.output}} 2>{{log.error}}" + +rule wolfpsort: + threads: get_threads("wolfpsort",10) + input: + protein_tmhmm = rules.tmhmm_fasta.output.fasta_parsed + output: + result_wolfpsort = f"{output_dir}2_SECRETED_PROTEIN/WOLFPSORT/{{samples}}/{{samples}}_wolfpsort.txt" + log: + error=f'{log_dir}wolfpsort/wolfpsort_{{samples}}.e', + output=f'{log_dir}wolfpsort/wolfpsort{{samples}}.o' + message: + f""" + Running {{rule}} + Input: + - Fasta TMHMM : {{input.protein_tmhmm}} + Output: + - Result WOLFPSORT : {{output.result_wolfpsort}} + Others + - Threads : {{threads}} + - LOG error: {{log.error}} + - LOG output: {{log.output}} + + """ + envmodules: + "wolfpsort/0.2" + shell: + f"runWolfPsortSummary fungi < {{input.protein_tmhmm}} 1>{{output.result_wolfpsort}} 2>{{log.error}}" +rule parse_wolfpsort: + threads: get_threads("parse_wolfpsort",1) + input: + wolfpsort_output = rules.wolfpsort.output.result_wolfpsort + output: + id_secreted_prot = f"{output_dir}5_FINAL_RESULT/SECRETED_PROTEIN/{{samples}}/{{samples}}_secreted.id" + params: + threshold=config["TOOLS_PARAMS"]["PARSE_WOLFPOSORT_TRESHOLD"] + log: + error=f'{log_dir}parse_wolfpsort/parse_wolfpsort_{{samples}}.e', + output=f'{log_dir}parse_wolfpsort/parse_wolfpsort{{samples}}.o' + message: + f""" + Running {{rule}} + Input: + - WOLFPSORT OUTPUT : {{input.wolfpsort_output}} + Output: + - ID SECRETED PROTEIN : {{output.id_secreted_prot}} + Others + - Threads : {{threads}} + - LOG error: {{log.error}} + - LOG output: {{log.output}} + + """ + + shell: + f"python {script_dir}parse_wolfpsort.py -in {{input.wolfpsort_output}} -th {{params.threshold}} -o {{output.id_secreted_prot}} 1>{{log.output}} 2>{{log.error}}" + +rule id_tofasta_secreted : + threads: get_threads("id_tofasta_secreted",1) + input: + id_secreted = rules.parse_wolfpsort.output.id_secreted_prot, + fasta_tmhmm = rules.tmhmm_fasta.output.fasta_parsed + output: + fasta_prot_secreted = 
f"{output_dir}5_FINAL_RESULT/SECRETED_PROTEIN/{{samples}}/{{samples}}_secreted.fasta" + log: + error=f'{log_dir}parse_wolfpsort/parse_wolfpsort_{{samples}}.e', + output=f'{log_dir}parse_wolfpsort/parse_wolfpsort{{samples}}.o' + message: + f""" + Running {{rule}} + Input: + - FASTA PROTEIN : {{input.fasta_tmhmm}} + - ID SECRETED : {{input.id_secreted}} + Output: + - FASTA SECRETED PROTEIN : {{output.fasta_prot_secreted}} + Others + - Threads : {{threads}} + - LOG error: {{log.error}} + - LOG output: {{log.output}} + + """ + shell: + f"python {script_dir}id_secreted_to_fasta.py -fasta {{input.fasta_tmhmm}} -id {{input.id_secreted}} -o {{output.fasta_prot_secreted}} 1>{{log.output}} 2>{{log.error}}" + +rule hmmer_pfam : + threads: get_threads("hmmer_pfam", 8) + input: + fasta_secreted = rules.id_tofasta_secreted.output.fasta_prot_secreted, + bdd_pfam = config["DATA"]["BDD_PFAM"] + output: + protein_secreted_domain = f"{output_dir}3_HMMER_PFAM/{{samples}}_secreted.tbl" + params: + param_hmmer = config["TOOLS_PARAMS"]["HMMER"] + log: + error=f'{log_dir}hmmer_pfam/hmmer_pfam_{{samples}}.e', + output=f'{log_dir}hmmer_pfam/hmmer_pfam_{{samples}}.o' + message: + f""" + Running {{rule}} + Input: + - FASTA PROTEIN : {{input.fasta_secreted}} + - BDD PFAM : {{input.bdd_pfam}} + Output: + - DOMMAINES PROTEINES : {{output.protein_secreted_domain}} + Others + - Threads : {{threads}} + - LOG error: {{log.error}} + - LOG output: {{log.output}} + + """ + envmodules: + "hmmer/3.2.1" + shell: + f"hmmsearch --tblout {{output.protein_secreted_domain}} {{params.param_hmmer}} {{input.bdd_pfam}} {{input.fasta_secreted}} 1>{{log.output}} 2>{{log.error}}" + +rule effectorP : + threads: get_threads("effectorP", 10) + input: + fasta_secreted = rules.id_tofasta_secreted.output.fasta_prot_secreted + output: + fasta_effectors = f"{output_dir}5_FINAL_RESULT/EFFECTOR/{{samples}}/{{samples}}_effector.fasta", + effectorP_out = 
f"{output_dir}5_FINAL_RESULT/EFFECTOR/{{samples}}/{{samples}}_effectorP.out", + no_effector_fasta = f"{output_dir}5_FINAL_RESULT/EFFECTOR/{{samples}}/{{samples}}_non_effector.fasta" + log: + error=f'{log_dir}effectorP/effectorP_{{samples}}.e', + output=f'{log_dir}effectorP/effectorP_{{samples}}.o' + message: + f""" + Running {{rule}} + Input: + - FASTA PROTEIN : {{input.fasta_secreted}} + Output: + - EFFECTOR FASTA : {{output.fasta_effectors}} + - NON EFFECTOR FASTA : {{output.no_effector_fasta}} + - EFFECTORP_OUT : {{output.effectorP_out}} + Others + - Threads : {{threads}} + - LOG error: {{log.error}} + - LOG output: {{log.output}} + + """ + envmodules: + "effectorp_local" + shell: + f"EffectorP.py -o {{output.effectorP_out}} -E {{output.fasta_effectors}} -N {{output.no_effector_fasta}} -i {{input.fasta_secreted}} 1>{{log.output}} 2>{{log.error}}" + +rule sort_gff: + threads: get_threads("sort_gff",1) + input: + gff_file = f"{gff_dir}{{samples}}.gff3" + output: + gff_sorted = f"{output_dir}4_GFF_SORTED/{{samples}}/{{samples}}_sorted.gff3", + log: + error=f'{log_dir}sort_gff/sort_gff_{{samples}}.e', + output=f'{log_dir}sort_gff/sort_gff_{{samples}}.o' + message: + f""" + Running {{rule}} + Input: + - GFF FILE : {{input.gff_file}} + Output: + - GFF SORTED : {{output.gff_sorted}} + Others + - Threads : {{threads}} + - LOG error: {{log.error}} + - LOG output: {{log.output}} + + """ + shell: + f"python {script_dir}gff_sort.py -g {{input.gff_file}} -o {{output.gff_sorted}} -name {{wildcards.samples}} 1>{{log.output}} 2>{{log.error}}" + +rule count_effector: + threads: get_threads("count_effector", 1) + input: + fasta_effectors = rules.effectorP.output.fasta_effectors, + gff_protein = rules.sort_gff.output.gff_sorted + output: + effector_per_contig = f"{output_dir}5_FINAL_RESULT/EFFECTOR/{{samples}}/{{samples}}_effector_per_contig.txt" + log: + error=f'{log_dir}count_effector/count_effector_{{samples}}.e', + output=f'{log_dir}count_effector/count_effector_{{samples}}.o' 
+ message: + f""" + Running {{rule}} + Input: + - FASTA EFFECTOR : {{input.fasta_effectors}} + - GFF SORTED : {{input.gff_protein}} + Output: + - EFFECTOR PER CONTING : {{output.effector_per_contig}} + Others + - Threads : {{threads}} + - LOG error: {{log.error}} + - LOG output: {{log.output}} + + """ + + shell: + """ + python {script_dir}count_effectors.py -g {input.gff_protein} -o {output.effector_per_contig} -fasta {input.fasta_effectors} 1>{log.output} 2>{log.error} + sort -V {output.effector_per_contig} -o {output.effector_per_contig} + """ \ No newline at end of file