Skip to content
Snippets Groups Projects
Commit 8baa5a2c authored by aurore.comte_ird.fr's avatar aurore.comte_ird.fr
Browse files

- add rule star_to_diffex to merge star count file output

- modification of stringtie rules to use hisat2 output and star output
- add in config file the possibility to use the pipeline in one shot or in comparison mode
parent be5f7db5
No related branches found
No related tags found
1 merge request!4Integration of STAR as mapper.
...@@ -8,6 +8,14 @@ DATA: ...@@ -8,6 +8,14 @@ DATA:
output_dir: "RNAJA_OUTPUT" output_dir: "RNAJA_OUTPUT"
PAIRED : true # if true should be _R1 / _R2 PAIRED : true # if true should be _R1 / _R2
MODE:
COMPARISON_MODE: false
ONE_PIPELINE:
# STAR, HISAT2
mapping: "STAR"
# STAR, STRINGTIE
count: "STAR"
PARAMS: PARAMS:
HISAT2: HISAT2:
indexation: indexation:
......
...@@ -8,6 +8,14 @@ DATA: ...@@ -8,6 +8,14 @@ DATA:
output_dir: "RNAJA_OUTPUT" output_dir: "RNAJA_OUTPUT"
PAIRED : true # if true should be _R1 / _R2 PAIRED : true # if true should be _R1 / _R2
MODE:
COMPARISON_MODE: false
ONE_PIPELINE:
# STAR, HISAT2
mapping: "STAR"
# STAR, STRINGTIE
count: "STAR"
PARAMS: PARAMS:
HISAT2: HISAT2:
indexation: indexation:
......
...@@ -8,6 +8,14 @@ DATA: ...@@ -8,6 +8,14 @@ DATA:
output_dir: "RNAJA_OUTPUT" output_dir: "RNAJA_OUTPUT"
PAIRED : false # if true should be _R1 / _R2 PAIRED : false # if true should be _R1 / _R2
MODE:
COMPARISON_MODE: false
ONE_PIPELINE:
# STAR, HISAT2
mapping: "STAR"
# STAR, STRINGTIE
count: "STAR"
PARAMS: PARAMS:
HISAT2: HISAT2:
indexation: indexation:
......
...@@ -40,8 +40,12 @@ if config["DATA"]["PAIRED"]: ...@@ -40,8 +40,12 @@ if config["DATA"]["PAIRED"]:
else: else:
SAMPLE_NAME, = glob_wildcards(f"{fastq_dir}/{{fastq}}{RNAja_obj.fastq_files_ext}") SAMPLE_NAME, = glob_wildcards(f"{fastq_dir}/{{fastq}}{RNAja_obj.fastq_files_ext}")
MAPPERS =["HISAT","STAR"] if config["MODE"]["COMPARISON_MODE"]:
COUNTERS =["STRINGTIE"] MAPPERS =["HISAT2","STAR"]
COUNTERS =["STRINGTIE"]
else:
MAPPERS = config["MODE"]["ONE_PIPELINE"]["mapping"]
COUNTERS = config["MODE"]["ONE_PIPELINE"]["count"]
###TODO: ajouter treatement ###TODO: ajouter treatement
#SAMPLE_NAME = RNAja_obj.samples_names_list #SAMPLE_NAME = RNAja_obj.samples_names_list
...@@ -88,7 +92,8 @@ def final_return(wildcards): ...@@ -88,7 +92,8 @@ def final_return(wildcards):
#"STRINGTIE" : expand(f'{output_dir}/COUNT/STRINGTIE/{{fastq}}.gtf', fastq = SAMPLE_NAME), #"STRINGTIE" : expand(f'{output_dir}/COUNT/STRINGTIE/{{fastq}}.gtf', fastq = SAMPLE_NAME),
#"STRINGTIE_list" : f'{output_dir}/COUNT/STRINGTIE/HISAT_Stringtie_list.txt', #"STRINGTIE_list" : f'{output_dir}/COUNT/STRINGTIE/HISAT_Stringtie_list.txt',
#"gffcompare_stat" : f'{output_dir}/COUNT/STRINGTIE/merged.stats', #"gffcompare_stat" : f'{output_dir}/COUNT/STRINGTIE/merged.stats',
#"gcsv_hisat" : f'{output_dir}/COUNT/STRINGTIE/HISAT_gene_count_matrix.csv', "gene_count" : expand(f'{output_dir}/COUNT/{{counters}}/{{mappers}}_gene_count_matrix.csv', mappers = MAPPERS, counters = COUNTERS),
"gene_count_star": expand(f'{output_dir}/COUNT/STAR/STAR_gene_count_matrix.csv',mappers=MAPPERS,counters=COUNTERS),
#"tcsv_hisat" : f'{output_dir}/COUNT/STRINGTIE/HISAT_transcript_count_matrix.csv', #"tcsv_hisat" : f'{output_dir}/COUNT/STRINGTIE/HISAT_transcript_count_matrix.csv',
#"RNA_diff_exp" : f"{output_dir}/4_DE_analysis/diane_Counts.csv" #"RNA_diff_exp" : f"{output_dir}/4_DE_analysis/diane_Counts.csv"
} }
...@@ -248,11 +253,6 @@ rule star_index: ...@@ -248,11 +253,6 @@ rule star_index:
)1>{log.output} 2>{log.error} )1>{log.output} 2>{log.error}
""" """
#TODO: faire une rule star to diffex :
# 1 - quelle colonne correspond au comptage qu'on doit utiliser?
# 2 - faire une table avec les colonnes de tous les echantillons
rule star_map_count: rule star_map_count:
""" """
This rule launch star for each RNAseq paired-end and single-end samples This rule launch star for each RNAseq paired-end and single-end samples
...@@ -266,7 +266,6 @@ rule star_map_count: ...@@ -266,7 +266,6 @@ rule star_map_count:
out_dir = f"{output_dir}/COUNT/STAR/", out_dir = f"{output_dir}/COUNT/STAR/",
other = config['PARAMS']['STAR']['mapping']["params"], other = config['PARAMS']['STAR']['mapping']["params"],
R2= f"{fastq_dir}/{{fastq}}_R2{RNAja_obj.fastq_files_ext}" if config['DATA']['PAIRED'] else '', R2= f"{fastq_dir}/{{fastq}}_R2{RNAja_obj.fastq_files_ext}" if config['DATA']['PAIRED'] else '',
linkcount=f"{output_dir}/COUNT/STAR/STAR_gene_count_matrix.csv"
output: output:
bam = f"{output_dir}/COUNT/STAR/{{fastq}}Aligned.out.sam", bam = f"{output_dir}/COUNT/STAR/{{fastq}}Aligned.out.sam",
genecount= f"{output_dir}/COUNT/STAR/{{fastq}}ReadsPerGene.out.tab", genecount= f"{output_dir}/COUNT/STAR/{{fastq}}ReadsPerGene.out.tab",
...@@ -299,12 +298,50 @@ rule star_map_count: ...@@ -299,12 +298,50 @@ rule star_map_count:
--quantMode TranscriptomeSAM GeneCounts \ --quantMode TranscriptomeSAM GeneCounts \
--outSAMtype BAM Unsorted --outSAMtype BAM Unsorted
{params.other}; {params.other};
ln -sf {output.genecount} {params.linkcount};
samtools view -@ {threads} -F 4 -bh {output.bam} | samtools sort --write-index -@ {threads} -o {output.bam_sort}; samtools view -@ {threads} -F 4 -bh {output.bam} | samtools sort --write-index -@ {threads} -o {output.bam_sort};
) 1>{log.output} 2>{log.error} ) 1>{log.output} 2>{log.error}
""" """
#TODO: faire une rule star to diffex :
# 1 - quelle colonne correspond au comptage qu'on doit utiliser?
# 2 - faire une table avec les colonnes de tous les echantillons
rule star_to_diffex:
"""
This rule get each star count for each samples and put it in one single file
"""
threads : get_threads('star_to_diffex', 10)
input :
countsfiles = expand(rules.star_map_count.output.genecount, fastq=SAMPLE_NAME)
output:
intermediate_result = temp(f"{output_dir}/COUNT/STAR/tmp.csv"),
stargenecount=f"{output_dir}/COUNT/STAR/STAR_gene_count_matrix.csv"
log :
output = f'{log_dir}/COUNT/STAR/star_to_de.o',
error = f'{log_dir}/COUNT/STAR/star_to_de.e'
singularity:
tools_config["SINGULARITY"]["TOOLS"]
message :
"""
Function :
- This rule get each star count for each samples and put it in one single file
Threads: {threads}
Input :
- files : {input.countsfiles}
Ouput :
- uniq count file : {output.stargenecount}
""" + "*" *100
shell :
"""
(
# retrieve the 4th column of each "ReadsPerGene.out.tab" file + the first column that contains the gene IDs
paste {input.countsfiles} | grep -v "_" | awk '{{printf "%s\t", $1}}{{for (i=4;i<=NF;i+=4) printf "%s\t", $i; printf "\n" }}' > {output.intermediate_result}
# add header: "gene_name" + the name of each of the counts file
sed -e "1igene_name\t$(ls {input.countsfiles} | tr '\n' '\t' | sed 's/ReadsPerGene.out.tab//g')" tmp | cut -f1-7 > {output.stargenecount}
) 1>{log.output} 2>{log.error}
"""
### --------------------- MAPPING STATS ### --------------------- MAPPING STATS
...@@ -366,36 +403,36 @@ rule multiqc: ...@@ -366,36 +403,36 @@ rule multiqc:
rule stringtie_discovery : rule stringtie_discovery :
threads: 4 threads: 4
input: input:
bam_hisat = rules.samtools_stats.input.sorted_bam_file, bam = rules.samtools_stats.input.sorted_bam_file,
gtf = annotation_path gtf = annotation_path
output: output:
hisat_gtf= f'{output_dir}/COUNT/STRINGTIE/{{fastq}}_DIS.gtf', gtf= f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_{{fastq}}_DIS.gtf',
hisat_tsv=f'{output_dir}/COUNT/STRINGTIE/{{fastq}}_DIS.tsv' tsv=f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_{{fastq}}_DIS.tsv'
message: message:
f""" f"""
Execute {{rule}} Execute {{rule}}
Input: Input:
- bam_hisat : {{input.bam_hisat}} - bam : {{input.bam}}
Output: Output:
- bam stats : {{output.hisat_gtf}} - bam stats : {{output.gtf}}
""" """
singularity: singularity:
tools_config["SINGULARITY"]["TOOLS"] tools_config["SINGULARITY"]["TOOLS"]
shell: shell:
""" """
stringtie -p 8 -B -G {input.gtf} -o {output.hisat_gtf} -A {output.hisat_tsv} {input.bam_hisat} stringtie -p 8 -B -G {input.gtf} -o {output.gtf} -A {output.tsv} {input.bam}
""" """
# Create a list of GTF created by Stringtie # Create a list of GTF created by Stringtie
rule stringtie_gtf_list_discovery: rule stringtie_gtf_list_discovery:
threads : 1 threads : 1
input: input:
list_gtf_hisat = expand(rules.stringtie_discovery.output.hisat_gtf, fastq = SAMPLE_NAME), list_gtf = expand(rules.stringtie_discovery.output.gtf, fastq = SAMPLE_NAME, mappers=MAPPERS),
output : output :
gtf_hisat = f'{output_dir}/COUNT/STRINGTIE/HISAT_gtf_list_DIS.txt', gtf_out = f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_gtf_list_DIS.txt',
shell: shell:
""" """
find {input.list_gtf_hisat} >> {output.gtf_hisat} find {input.list_gtf} >> {output.gtf_out}
""" """
# Merge stringtie GTF # Merge stringtie GTF
...@@ -403,9 +440,9 @@ rule merge_stringtie_gtf_discovery: ...@@ -403,9 +440,9 @@ rule merge_stringtie_gtf_discovery:
threads: 4 threads: 4
input: input:
gtf_ref = annotation_path, gtf_ref = annotation_path,
list_gtf_hisat = rules.stringtie_gtf_list_discovery.output.gtf_hisat, list_gtf = rules.stringtie_gtf_list_discovery.output.gtf_out,
output: output:
gtf_merged = f'{output_dir}/COUNT/STRINGTIE/stringtie_merged_DIS.gtf', gtf_merged = f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_stringtie_merged_DIS.gtf',
message: message:
""" """
Execute {rule} Execute {rule}
...@@ -414,7 +451,7 @@ rule merge_stringtie_gtf_discovery: ...@@ -414,7 +451,7 @@ rule merge_stringtie_gtf_discovery:
tools_config["SINGULARITY"]["TOOLS"] tools_config["SINGULARITY"]["TOOLS"]
shell: shell:
""" """
stringtie --merge -p 4 -G {input.gtf_ref} -o {output.gtf_merged} {input.list_gtf_hisat} stringtie --merge -p 4 -G {input.gtf_ref} -o {output.gtf_merged} {input.list_gtf}
""" """
#Create the count table from the BAM files with stringtie #Create the count table from the BAM files with stringtie
...@@ -424,11 +461,11 @@ rule stringtie : ...@@ -424,11 +461,11 @@ rule stringtie :
""" """
threads: 4 threads: 4
input: input:
bam_hisat = rules.samtools_stats.input.sorted_bam_file, bam = rules.samtools_stats.input.sorted_bam_file,
gtf = f"{rules.merge_stringtie_gtf_discovery.output.gtf_merged}" if config["PARAMS"]["STRINGTIE"]["discovery_mode"] else f'{annotation_path}' gtf = f"{rules.merge_stringtie_gtf_discovery.output.gtf_merged}" if config["PARAMS"]["STRINGTIE"]["discovery_mode"] else f'{annotation_path}'
output: output:
hisat_gtf= f'{output_dir}/COUNT/STRINGTIE/{{fastq}}.gtf', gtf= f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_{{fastq}}.gtf',
hisat_tsv = f'{output_dir}/COUNT/STRINGTIE/{{fastq}}.tsv' tsv = f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_{{fastq}}.tsv'
message: message:
""" """
Execute {rule} Execute {rule}
...@@ -437,7 +474,7 @@ rule stringtie : ...@@ -437,7 +474,7 @@ rule stringtie :
tools_config["SINGULARITY"]["TOOLS"] tools_config["SINGULARITY"]["TOOLS"]
shell: shell:
""" """
stringtie -p 8 -e -B -G {input.gtf} -o {output.hisat_gtf} -A {output.hisat_tsv} {input.bam_hisat} stringtie -p 8 -e -B -G {input.gtf} -o {output.gtf} -A {output.tsv} {input.bam}
""" """
...@@ -445,12 +482,12 @@ rule stringtie : ...@@ -445,12 +482,12 @@ rule stringtie :
rule stringtie_gtf_list: rule stringtie_gtf_list:
threads : 1 threads : 1
input: input:
list_gtf_hisat = expand(f'{output_dir}/COUNT/STRINGTIE/{{fastq}}.gtf', fastq = SAMPLE_NAME), list_gtf = expand(f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_{{fastq}}.gtf', fastq = SAMPLE_NAME, mappers=MAPPERS),
output : output :
gtf_hisat = f'{output_dir}/COUNT/STRINGTIE/HISAT_gtf_list.txt', gtf_out = f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_gtf_list.txt',
shell: shell:
""" """
find {input.list_gtf_hisat} >> {output.gtf_hisat} find {input.list_gtf} >> {output.gtf_out}
""" """
# Merge stringtie GTF # Merge stringtie GTF
...@@ -458,9 +495,9 @@ rule merge_stringtie_gtf: ...@@ -458,9 +495,9 @@ rule merge_stringtie_gtf:
threads: 4 threads: 4
input: input:
gtf_ref = annotation_path, gtf_ref = annotation_path,
list_gtf_hisat = rules.stringtie_gtf_list.output.gtf_hisat, list_gtf = rules.stringtie_gtf_list.output.gtf_out,
output: output:
gtf_merged = f'{output_dir}/COUNT/STRINGTIE/stringtie_merged.gtf', gtf_merged = f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_stringtie_merged.gtf',
message: message:
""" """
Execute {rule} Execute {rule}
...@@ -469,35 +506,35 @@ rule merge_stringtie_gtf: ...@@ -469,35 +506,35 @@ rule merge_stringtie_gtf:
tools_config["SINGULARITY"]["TOOLS"] tools_config["SINGULARITY"]["TOOLS"]
shell: shell:
""" """
stringtie --merge -p 4 -G {input.gtf_ref} -o {output.gtf_merged} {input.list_gtf_hisat} stringtie --merge -p 4 -G {input.gtf_ref} -o {output.gtf_merged} {input.list_gtf}
""" """
# Create a list with the ID sample and the path of the stringtie GTF # Create a list with the ID sample and the path of the stringtie GTF
rule list_for_prepDE: rule list_for_prepDE:
input: rules.stringtie_gtf_list.output.gtf_hisat input: rules.stringtie_gtf_list.output.gtf_out
output: output:
hisat_list= f'{output_dir}/COUNT/STRINGTIE/HISAT_Stringtie_list.txt' list= f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_Stringtie_list.txt'
run: run:
hisat_list_name_reads= open(output.hisat_list, "w") list_name_reads= open(output.hisat_list, "w")
hisat_gtf = open(rules.stringtie_gtf_list.output.gtf_hisat, "r") gtf = open(rules.stringtie_gtf_list.output.gtf_out, "r")
for i in range(len(SAMPLE_NAME)): for i in range(len(SAMPLE_NAME)):
name=SAMPLE_NAME[i] name=SAMPLE_NAME[i]
hisat_list_name_reads.write(name + " " + hisat_gtf.readline()) list_name_reads.write(name + " " + gtf.readline())
# Convert the stringtie GTF to a count table # Convert the stringtie GTF to a count table
rule prepDE_stringtie_table: rule prepDE_stringtie_table:
threads : 1 threads : 1
input : input :
mergelist_hisat = rules.list_for_prepDE.output.hisat_list, mergelist = rules.list_for_prepDE.output.list,
output : output :
gcsv_hisat = f'{output_dir}/COUNT/{{mappers}}/{{mappers}}_gene_count_matrix.csv', gcsv = f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_gene_count_matrix.csv',
tcsv_hisat = f'{output_dir}/COUNT/{{mappers}}/{{mappers}}_transcript_count_matrix.csv', tcsv = f'{output_dir}/COUNT/STRINGTIE/{{mappers}}_transcript_count_matrix.csv',
singularity: singularity:
tools_config["SINGULARITY"]["TOOLS"] tools_config["SINGULARITY"]["TOOLS"]
shell: shell:
""" """
prepDE.py -i {input.mergelist_hisat} -t {output.tcsv_hisat} -g {output.gcsv_hisat} prepDE.py -i {input.mergelist} -t {output.tcsv} -g {output.gcsv}
""" """
rule diff_exp_analysis: rule diff_exp_analysis:
...@@ -506,7 +543,7 @@ rule diff_exp_analysis: ...@@ -506,7 +543,7 @@ rule diff_exp_analysis:
""" """
threads: get_threads('diff_exp_analysis',4) threads: get_threads('diff_exp_analysis',4)
input: input:
gcsv_hisat=rules.prepDE_stringtie_table.output.gcsv_hisat, gcsv=rules.prepDE_stringtie_table.output.gcsv,
sample_info=config["DATA"]["sample_info"], sample_info=config["DATA"]["sample_info"],
#de_comparisons_file=config["DATA"]["files"]["de_comparisons_file"], #de_comparisons_file=config["DATA"]["files"]["de_comparisons_file"],
gtf_file=config["DATA"]["annotation"], gtf_file=config["DATA"]["annotation"],
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment