Skip to content

Commit b308f41

Browse files
committed
added -M option to put in N records for filtered out reads, changed -n to truncate at Ns rather than discard, lots of code refactoring, changed sickle xml for -M option
1 parent dbff68b commit b308f41

File tree

9 files changed

+318
-230
lines changed

9 files changed

+318
-230
lines changed

Makefile

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
PROGRAM_NAME = sickle
2-
VERSION = 1.31
2+
VERSION = 1.33
33
CC = gcc
44
CFLAGS = -Wall -pedantic -DVERSION=$(VERSION)
55
DEBUG = -g
@@ -25,6 +25,9 @@ trim_paired.o: $(SDIR)/trim_paired.c $(SDIR)/sickle.h $(SDIR)/kseq.h
2525
sickle.o: $(SDIR)/sickle.c $(SDIR)/sickle.h
2626
$(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c
2727

28+
print_record.o: $(SDIR)/print_record.c $(SDIR)/print_record.h
29+
$(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c
30+
2831
clean:
2932
rm -rf *.o $(SDIR)/*.gch ./sickle
3033

@@ -34,7 +37,7 @@ distclean: clean
3437
dist:
3538
tar -zcf $(ARCHIVE).tar.gz src Makefile
3639

37-
build: sliding.o trim_single.o trim_paired.o sickle.o
40+
build: sliding.o trim_single.o trim_paired.o sickle.o print_record.o
3841
$(CC) $(CFLAGS) $(LDFLAGS) $(OPT) $? -o sickle $(LIBS)
3942

4043
debug:

README.md

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,10 @@ in the window drops below the threshold, the algorithm determines
2323
where in the window the drop occurs and cuts both the read and quality
2424
strings there for the 3'-end cut. However, if the length of the
2525
remaining sequence is less than the minimum length threshold, then the
26-
read is discarded entirely. 5'-end trimming can be disabled.
26+
read is discarded entirely (or replaced with an "N" record). 5'-end
27+
trimming can be disabled.
2728

28-
Sickle also has an option to discard reads with any Ns in them.
29+
Sickle also has an option to truncate reads with Ns at the first N position.
2930

3031
Sickle supports three types of quality values: Illumina, Solexa, and
3132
Sanger. Note that the Solexa quality setting is an approximation (the
@@ -48,7 +49,7 @@ local [Galaxy](http://galaxy.psu.edu/) server.
4849
Sickle doesn't have a paper, but you can cite it like this:
4950

5051
Joshi NA, Fass JN. (2011). Sickle: A sliding-window, adaptive, quality-based trimming tool for FastQ files
51-
(Version 1.31) [Software]. Available at https://github.com/najoshi/sickle.
52+
(Version 1.33) [Software]. Available at https://github.com/najoshi/sickle.
5253

5354
## Requirements
5455

@@ -86,7 +87,7 @@ specific to those commands:
8687
`sickle se` takes an input fastq file and outputs a trimmed version of
8788
that file. It also has options to change the length and quality
8889
thresholds for trimming, as well as disabling 5'-trimming and enabling
89-
removal of sequences with Ns.
90+
truncation of sequences with Ns.
9091

9192
#### Examples
9293

@@ -104,9 +105,12 @@ combined input file of reads where you have already interleaved the
104105
reads from the sequencer. In this form, you also supply a single
105106
output file name as well as a "singles" file. The "singles" file
106107
contains reads that passed filter in either the forward or reverse
107-
direction, but not the other. You can also change the length and
108-
quality thresholds for trimming, as well as disable 5'-trimming and
109-
enable removal of sequences with Ns.
108+
direction, but not the other. Finally, there is an option to only
109+
produce one interleaved output file where any reads that did not pass
110+
filter will be output as a FastQ record with a single "N", thus
111+
preserving the paired nature of the data. You can also
112+
change the length and quality thresholds for trimming, as well as
113+
disable 5'-trimming and enable truncation of sequences with Ns.
110114

111115
#### Examples
112116

@@ -128,3 +132,5 @@ enable removal of sequences with Ns.
128132
sickle pe -t sanger -g -f input_file1.fastq -r input_file2.fastq \
129133
-o trimmed_output_file1.fastq.gz -p trimmed_output_file2.fastq.gz \
130134
-s trimmed_singles_file.fastq.gz
135+
136+
sickle pe -c combo.fastq -t sanger -M combo_trimmed_all.fastq

sickle.xml

Lines changed: 65 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,11 @@
2020
#end if
2121

2222
#if str($readtype.single_or_paired) == "pe_combo":
23-
pe -c $input_combo -m $output_combo -s $output_combo_single
23+
#if $readtype.output_n:
24+
pe -c $input_combo -M $output_combo
25+
#else
26+
pe -c $input_combo -m $output_combo -s $output_combo_single
27+
#end if
2428

2529
#if $input_combo.ext == "fastq":
2630
-t sanger
@@ -61,7 +65,7 @@
6165
-x
6266
#end if
6367

64-
#if $discard_n:
68+
#if $trunc_n:
6569
-n
6670
#end if
6771

@@ -73,7 +77,7 @@
7377
<param name="single_or_paired" type="select" optional="false" label="Single-End or Paired-End reads?" help="Note: Sickle will infer the quality type of the file from its datatype. I.e., if the datatype is fastqsanger, then the quality type is sanger. The default is fastqsanger.">
7478
<option value="se" selected="true">Single-End</option>
7579
<option value="pe_combo">Paired-End (one interleaved input file)</option>
76-
<option value="pe_sep">Paired-End (two separate input files)</option>
80+
<option value="pe_sep">Paired-End (two separate input files)</option>
7781
</param>
7882

7983
<when value="se">
@@ -82,6 +86,7 @@
8286

8387
<when value="pe_combo">
8488
<param format="fastq, fastqsanger, fastqillumina, fastqsolexa" name="input_combo" type="data" optional="false" label="Paired-End Interleaved FastQ Reads"/>
89+
<param name="output_n" type="boolean" label="Output only one file with all reads" help="This will output only one file with all the reads, where the reads that did not pass filter will be replaced with a single 'N'"/>
8590
</when>
8691

8792
<when value="pe_sep">
@@ -99,7 +104,7 @@
99104
</param>
100105

101106
<param name="no_five_prime" type="boolean" label="Don't do 5' trimming"/>
102-
<param name="discard_n" type="boolean" label="Discard sequences with Ns"/>
107+
<param name="trunc_n" type="boolean" label="Truncate sequences with Ns at first N position"/>
103108
</inputs>
104109

105110
<outputs>
@@ -113,6 +118,7 @@
113118

114119
<data format_source="input_combo" name="output_combo_single" label="Singletons from Paired-End interleaved output of ${tool.name} on ${on_string}">
115120
<filter>(readtype['single_or_paired'] == 'pe_combo')</filter>
121+
<filter>(readtype['output_n'] == False)</filter>
116122
</data>
117123

118124
<data format_source="input_paired1" name="output_paired1" label="Paired-End forward strand output of ${tool.name} on ${on_string}">
@@ -129,22 +135,71 @@
129135
</outputs>
130136

131137
<help>
132-
Most modern sequencing technologies produce reads that have deteriorating quality towards the 3'-end and some towards the 5'-end as well. Incorrectly called bases in both regions negatively impact assembles, mapping, and downstream bioinformatics analyses.
138+
**Sickle - A windowed adaptive trimming tool for FASTQ files using quality**
139+
140+
.. class:: infomark
141+
142+
**About**
143+
144+
Most modern sequencing technologies produce reads that have
145+
deteriorating quality towards the 3'-end and some towards the 5'-end
146+
as well. Incorrectly called bases in both regions negatively impact
147+
assembles, mapping, and downstream bioinformatics analyses.
148+
149+
Sickle is a tool that uses sliding windows along with quality and
150+
length thresholds to determine when quality is sufficiently low to
151+
trim the 3'-end of reads and also determines when the quality is
152+
sufficiently high enough to trim the 5'-end of reads. It will also
153+
discard reads based upon the length threshold. It takes the quality
154+
values and slides a window across them whose length is 0.1 times the
155+
length of the read. If this length is less than 1, then the window is
156+
set to be equal to the length of the read. Otherwise, the window
157+
slides along the quality values until the average quality in the
158+
window rises above the threshold, at which point the algorithm
159+
determines where within the window the rise occurs and cuts the read
160+
and quality there for the 5'-end cut. Then when the average quality
161+
in the window drops below the threshold, the algorithm determines
162+
where in the window the drop occurs and cuts both the read and quality
163+
strings there for the 3'-end cut. However, if the length of the
164+
remaining sequence is less than the minimum length threshold, then the
165+
read is discarded entirely (or replaced with an "N" record). 5'-end
166+
trimming can be disabled.
167+
168+
Sickle also has an option to truncate reads with Ns at the first N position.
169+
170+
Sickle supports three types of quality values: Illumina, Solexa, and
171+
Sanger. Note that the Solexa quality setting is an approximation (the
172+
actual conversion is a non-linear transformation). The end
173+
approximation is close. Illumina quality refers to qualities encoded
174+
with the CASAVA pipeline between versions 1.3 and 1.7. Illumina
175+
quality using CASAVA >= 1.8 is Sanger encoded.
133176

134-
Sickle is a tool that uses sliding windows along with quality and length thresholds to determine when quality is sufficiently low to trim the 3'-end of reads and also determines when the quality is sufficiently high enough to trim the 5'-end of reads. It will also discard reads based upon the length threshold. It takes the quality values and slides a window across them whose length is 0.1 times the length of the read. If this length is less than 1, then the window is set to be equal to the length of the read. Otherwise, the window slides along the quality values until the average quality in the window rises above the threshold, at which point the algorithm determines where within the window the rise occurs and cuts the read and quality there for the 5'-end cut. Then when the average quality in the window drops below the threshold, the algorithm determines where in the window the drop occurs and cuts both the read and quality strings there for the 3'-end cut. However, if the length of the remaining sequence is less than the minimum length threshold, then the read is discarded entirely. 5'-end trimming can be disabled.
177+
Note that Sickle will remove the 2nd fastq record header (on the "+"
178+
line) and replace it with simply a "+". This is the default format for
179+
CASAVA >= 1.8.
135180

136-
Sickle also has an option to discard reads with any Ns in them.
181+
Sickle also supports gzipped file inputs and optional gzipped outputs. By default,
182+
Sickle will produce regular (i.e. not gzipped) output, regardless of the input.
137183

138-
Sickle supports three types of quality values: Illumina, Solexa, and Sanger. Note that the Solexa quality setting is an approximation (the actual conversion is a non-linear transformation). The end approximation is close. Illumina quality refers to qualities encoded with the CASAVA pipeline between versions 1.3 and 1.7. Illumina quality using CASAVA >= 1.8 is Sanger encoded. Sickle will get the quality type from the datatype of the file.
184+
.. class:: infomark
139185

140-
Note that Sickle will remove the 2nd fastq record header (on the "+" line) and replace it with simply a "+". This is the default format for CASAVA >= 1.8.
186+
**Citation**
141187

142-
Sickle also supports gzipped file inputs.
188+
Sickle doesn't have a paper, but you can cite it like this::
189+
190+
Joshi NA, Fass JN. (2011). Sickle: A sliding-window, adaptive, quality-based trimming tool for FastQ files
191+
(Version 1.33) [Software]. Available at https://github.com/najoshi/sickle.
192+
193+
-----
143194

144195
Copyright: Nikhil Joshi
196+
145197
http://bioinformatics.ucdavis.edu
198+
146199
http://github.com/ucdavis-bioinformatics
200+
147201
http://github.com/najoshi
202+
148203
</help>
149204

150205
</tool>

src/print_record.c

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#include <assert.h>
2+
#include <ctype.h>
3+
#include <stdlib.h>
4+
#include <zlib.h>
5+
#include <stdio.h>
6+
#include <unistd.h>
7+
#include "sickle.h"
8+
#include "kseq.h"
9+
10+
11+
void print_record (FILE *fp, kseq_t *fqr, cutsites *cs) {
12+
fprintf(fp, "@%s", fqr->name.s);
13+
if (fqr->comment.l) fprintf(fp, " %s\n", fqr->comment.s);
14+
else fprintf(fp, "\n");
15+
fprintf(fp, "%.*s\n", cs->three_prime_cut - cs->five_prime_cut, fqr->seq.s + cs->five_prime_cut);
16+
fprintf(fp, "+\n");
17+
fprintf(fp, "%.*s\n", cs->three_prime_cut - cs->five_prime_cut, fqr->qual.s + cs->five_prime_cut);
18+
}
19+
20+
void print_record_gzip (gzFile fp, kseq_t *fqr, cutsites *cs) {
21+
gzprintf(fp, "@%s", fqr->name.s);
22+
if (fqr->comment.l) gzprintf(fp, " %s\n", fqr->comment.s);
23+
else gzprintf(fp, "\n");
24+
gzprintf(fp, "%.*s\n", cs->three_prime_cut - cs->five_prime_cut, fqr->seq.s + cs->five_prime_cut);
25+
gzprintf(fp, "+\n");
26+
gzprintf(fp, "%.*s\n", cs->three_prime_cut - cs->five_prime_cut, fqr->qual.s + cs->five_prime_cut);
27+
}
28+
29+
void print_record_N (FILE *fp, kseq_t *fqr, int qualtype) {
30+
fprintf(fp, "@%s", fqr->name.s);
31+
if (fqr->comment.l) fprintf(fp, " %s\n", fqr->comment.s);
32+
else fprintf(fp, "\n");
33+
fprintf(fp, "N\n+\n%c\n", quality_constants[qualtype][Q_MIN]);
34+
}
35+
36+
void print_record_N_gzip (gzFile fp, kseq_t *fqr, int qualtype) {
37+
gzprintf(fp, "@%s", fqr->name.s);
38+
if (fqr->comment.l) gzprintf(fp, " %s\n", fqr->comment.s);
39+
else gzprintf(fp, "\n");
40+
gzprintf(fp, "N\n+\n%c\n", quality_constants[qualtype][Q_MIN]);
41+
}
42+

src/print_record.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#ifndef PRINT_RECORD_H
2+
#define PRINT_RECORD_H
3+
4+
#include <stdio.h>
5+
#include <zlib.h>
6+
#include "kseq.h"
7+
8+
void print_record (FILE *fp, kseq_t *fqr, cutsites *cs);
9+
void print_record_gzip (gzFile fp, kseq_t *fqr, cutsites *cs);
10+
void print_record_N (FILE *fp, kseq_t *fqr, int qualtype);
11+
void print_record_N_gzip (gzFile fp, kseq_t *fqr, int qualtype);
12+
13+
#endif /* PRINT_RECORD_H */

src/sickle.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ enum {
4848
"version", no_argument, NULL, GETOPT_VERSION_CHAR
4949
#define case_GETOPT_HELP_CHAR(Usage_call) \
5050
case GETOPT_HELP_CHAR: \
51-
Usage_call(EXIT_SUCCESS); \
51+
Usage_call(EXIT_SUCCESS, NULL); \
5252
break;
5353
#define case_GETOPT_VERSION_CHAR(Program_name, Version, Authors) \
5454
case GETOPT_VERSION_CHAR: \

src/sliding.c

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) {
3232
}
3333

3434

35-
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int discard_n) {
35+
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n) {
3636

3737
int window_size = (int) (0.1 * fqrec->seq.l);
3838
int i,j;
@@ -43,11 +43,10 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int
4343
int found_five_prime = 0;
4444
double window_avg;
4545
cutsites* retvals;
46+
char *npos;
4647

47-
/* If the sequence contains an "N" then discard if the option has been selected */
48-
/* Also discard if the length of the sequence is less than the length threshold */
49-
if ((discard_n && (strstr(fqrec->seq.s, "N") || strstr(fqrec->seq.s, "n"))) ||
50-
(fqrec->seq.l < length_threshold)) {
48+
/* discard if the length of the sequence is less than the length threshold */
49+
if (fqrec->seq.l < length_threshold) {
5150
retvals = (cutsites*) malloc (sizeof(cutsites));
5251
retvals->three_prime_cut = -1;
5352
retvals->five_prime_cut = -1;
@@ -107,6 +106,12 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int
107106
}
108107

109108

109+
/* If truncate N option is selected, and sequence has Ns, then */
110+
/* change 3' cut site to be the base before the first N */
111+
if (trunc_n && ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n")))) {
112+
three_prime_cut = npos - fqrec->seq.s;
113+
}
114+
110115
/* if cutting length is less than threshold then return -1 for both */
111116
/* to indicate that the read should be discarded */
112117
/* Also, if you never find a five prime cut site, then discard whole read */

0 commit comments

Comments
 (0)