Skip to content

Commit 946f2ee

Browse files
committed
included support for gmap-aligned transcripts
1 parent 23f2037 commit 946f2ee

File tree

4 files changed

+48
-9
lines changed

4 files changed

+48
-9
lines changed

sample_data/simple_transcriptome_target/cleanme.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
runMe.sh
1717
Trinity.fasta.gz
1818
Makefile
19+
genome_alignments.gmap.gff3.gz
1920
);
2021

2122

Binary file not shown.

sample_data/simple_transcriptome_target/runMe.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,17 @@
22

33
if [ ! -e Trinity.fasta ]; then
44
gunzip -c Trinity.fasta.gz > Trinity.fasta
5+
gunzip -c genome_alignments.gmap.gff3.gz > genome_alignments.gmap.gff3
56
fi
67

78
../../TransDecoder.LongOrfs -t Trinity.fasta
89

910
../../TransDecoder.Predict -t Trinity.fasta $*
1011

12+
# gmap was used to align the Trinity.fasta transcripts to the genome,
13+
# using the gmap '-f 3' output formatting parameter, generating file 'genome_alignments.gmap.gff3'
14+
15+
../../util/cdna_alignment_orf_to_genome_orf.pl Trinity.fasta.transdecoder.gff3 genome_alignments.gmap.gff3 Trinity.fasta > Trinity.fasta.transdecoder.genome.gff3
16+
17+
1118
exit 0

util/cdna_alignment_orf_to_genome_orf.pl

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
use Data::Dumper;
1212
use Fasta_reader;
1313

14-
my $usage = "\nusage: $0 cdna_orfs.genes.gff3 cdna_genome.alignments.gff3 cdna.fasta\n\n";
14+
my $usage = "\nusage: $0 cdna_orfs.genes.gff3 cdna_genome.alignments.gff3 cdna.fasta [DEBUG]\n\n";
1515

1616
my $cdna_orfs_gff3 = $ARGV[0] or die $usage;
1717
my $cdna_genome_gff3 = $ARGV[1] or die $usage;
@@ -27,22 +27,30 @@
2727

2828
my $WARNING_COUNT = 0; # count those orfs that appear to be on strand opposite from the transcribed strand.
2929

30+
my $DEBUG = $ARGV[3];
31+
3032
main: {
3133

34+
print STDERR "-parsing cdna lengths\n" if $DEBUG;
3235
my %cdna_seq_lengths = &parse_cdna_seq_lengths($cdna_fasta);
3336

3437
my %orf_counter;
3538

39+
print STDERR "-parse transcript alignment info\n" if $DEBUG;
3640
my %cdna_acc_to_transcript_structure = &parse_transcript_alignment_info($cdna_genome_gff3);
3741

3842
## parse ORFs on cDNAs
3943

44+
print STDERR "-index $cdna_orfs_gff3\n" if $DEBUG;
4045
my $gene_obj_indexer_href = {};
4146
## associate gene identifiers with contig id's.
4247
my $contig_to_gene_list_href = &GFF3_utils::index_GFF3_gene_objs($cdna_orfs_gff3, $gene_obj_indexer_href);
4348

4449

4550
my %isolated_gene_id_to_new_genes;
51+
52+
my $count_propagated = 0;
53+
my $total = 0;
4654

4755
foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) {
4856
## $asmbl_id is the actual Transcript identifier from which ORFs were predicted.
@@ -51,9 +59,17 @@
5159

5260
foreach my $gene_id (@gene_ids) { # gene identifiers as given by transdecoder on the transcript sequences
5361
my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
62+
63+
$total++;
5464

55-
my $transcript_struct = $cdna_acc_to_transcript_structure{$asmbl_id} or die "Error, no cdna struct for $asmbl_id";
65+
my $transcript_struct = $cdna_acc_to_transcript_structure{$asmbl_id};
66+
unless ($transcript_struct) {
67+
print STDERR "-WARNING, $asmbl_id has no genome representation... skipping\n";
68+
next;
69+
}
5670

71+
#print "$gene_obj_ref\n";
72+
5773

5874
eval {
5975
my $new_orf_gene = &place_orf_in_cdna_alignment_context($transcript_struct, $gene_obj_ref, \%cdna_seq_lengths);
@@ -76,24 +92,25 @@
7692
$new_orf_gene->{TU_feat_name} = $use_gene_id;
7793
$new_orf_gene->{Model_feat_name} = $gene_obj_ref->{Model_feat_name};
7894

95+
print STDERR "New orf: " . $new_orf_gene->toString() if $DEBUG;
7996
push (@{$isolated_gene_id_to_new_genes{$use_gene_id}}, $new_orf_gene);
80-
97+
98+
$count_propagated++;
8199
}
82100
};
83101

84102
if ($@) {
85103

86-
print STDERR "Error occurred.\n";
104+
print STDERR "Error, $gene_id couldn't be fully propagated:\n";
87105

88106
print STDERR Dumper($transcript_struct);
89107
print STDERR $gene_obj_ref->toString();
90108
print STDERR "$@";
91-
die;
92109
}
93110

94111
}
95112
}
96-
113+
97114
## output results:
98115
foreach my $gene_id (sort keys %isolated_gene_id_to_new_genes) {
99116

@@ -112,7 +129,10 @@
112129

113130
print $parent_gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
114131
}
115-
132+
133+
134+
print STDERR "\n\n\tDone. $count_propagated / $total transcript orfs could be propagated to the genome\n\n";
135+
116136
exit(0);
117137

118138
}
@@ -125,16 +145,19 @@ sub parse_transcript_alignment_info {
125145

126146
open (my $fh, $cdna_align_gff3) or die "Error, cannot open file $cdna_align_gff3";
127147
while (<$fh>) {
148+
if (/^\#/) { next; }
128149
unless (/\w/) { next; }
129150

151+
my $line = $_;
152+
130153
my @x = split(/\t/);
131154
my $contig = $x[0];
132155
my $lend = $x[3];
133156
my $rend = $x[4];
134157
my $orient = $x[6];
135158
my $info = $x[8];
136159

137-
$info =~ /Target=(\S+)/ or die "Error, cannot parse ID from $info";
160+
$info =~ /Target=(\S+)/ or die "Error, cannot parse ID from info [$info] of line $line";
138161
my $asmbl = $1;
139162

140163
my $trans_id = "";
@@ -182,6 +205,8 @@ sub parse_transcript_alignment_info {
182205
sub place_orf_in_cdna_alignment_context {
183206
my ($transcript_struct, $orf_gene_obj, $cdna_seq_lengths_href) = @_;
184207

208+
#print $orf_gene_obj->toString();
209+
185210
my $trans_seq_length = $cdna_seq_lengths_href->{ $transcript_struct->{asmbl} } or confess "Error, no length for " . Dumper($transcript_struct) . " Please be sure to use a cDNA fasta file and not a genome fasta file for your commandline parameter.";
186211

187212

@@ -199,10 +224,16 @@ sub place_orf_in_cdna_alignment_context {
199224
}
200225
}
201226

227+
#print Dumper(\@cds_coords);
228+
202229
@cds_coords = sort {$a->[0]<=>$b->[0]} @cds_coords;
203230

204231
my $cds_span_lend = $cds_coords[0]->[0];
205232
my $cds_span_rend = $cds_coords[$#cds_coords]->[1];
233+
234+
unless ($cds_span_lend && $cds_span_rend) {
235+
die "Error, missing cds span for gene: " . $orf_gene_obj->toString();
236+
}
206237

207238
if ($cds_span_rend > $trans_seq_length) {
208239
$cds_span_rend = $trans_seq_length;
@@ -303,7 +334,7 @@ sub from_cdna_lend {
303334
}
304335

305336

306-
die "Error, couldn't localize pt $pt within coordsets: " . Dumper($coords_aref);
337+
die "Error, couldn't localize pt [$pt] within coordsets: " . Dumper($coords_aref);
307338

308339
return;
309340
}

0 commit comments

Comments
 (0)