-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_isoforms_gtf.pl
executable file
·89 lines (79 loc) · 2.68 KB
/
merge_isoforms_gtf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/perl -s -w
## This script merges all isoforms into one (often non-existing)
## transcript: all exons are strung together into one gene model (which
## may have non-existing exon juctions, and will have existing junctions missing)
## The results is a gtf file with names and coordinates; to create a fasta file
## out of this, use gffread.
## Original written by Dominic Grün
## Note: in the Hg38 RefSeq there are genes with several copies that all have the
## same name (like RNU1-1) (EnsEMBL does this properly, e.g. RN1-1 and
## RNU1-3). The result is that all copies are strung together into one
## supertranscript, *even if they lie on different strands* !!
use tools;
if (!@ARGV && (!$in || !$cl || !$out)) {
die "usage: -in=INPUT.gtf -cl=clusters_list.tsv -out=OUTPUT.gtf\n";
}
## clusters_list.tsv comes from the -m option of ucsc2gtf.pl, is a table (reverse)mapping gene_id to
## transcript_ids
%stop = ();
%strand = ();
%chr=();
open(IN,"<",$cl) || die "$cl: $!";
while(<IN>){
chomp;
($cl_name,$e) = split(/\t/);
@me = split(/\|/,$e);
foreach $m (@me){
$clust{$m} = $cl_name;
}
}
close(IN);
die "$cl: no clusters read " unless int(keys %clust);
open(IN,"<",$in) || die "$in: $!";
while(<IN>){
chomp;
next if $_ =~ /^\#/;
@F = split(/\t/);
next if $F[2] eq "transcript";
$info = $F[$#F];
$info =~ s/(\"|\;)//g;
($dum,$dum,$dum,$tid) = split(/\s+/,$info);
$id = $clust{$tid};
if (!exists($stop{$id}{$F[3]})){
$stop{$id}{$F[3]} = $F[4];
$strand{$id} = $F[6];
$chr{$id} = $F[0];
}else{
$stop{$id}{$F[3]} = $F[4] if $F[4] > $stop{$id}{$F[3]};
}
}
close(IN);
open(OUT,">",$out);
foreach $id (sort keys %stop){
@start = sort {$a <=> $b} keys %{$stop{$id}};
$tmp_stop = -1;
@exon = ();
foreach $i (0..$#start){
$s = $start[$i];
if ($i == 0){
$tmp_min = $s;
$tmp_max = $stop{$id}{$s};
$tmp_start = $s;
$tmp_stop = $stop{$id}{$s};
}else{
$tmp_min = min($tmp_min, $s);
$tmp_max = max($tmp_max,$stop{$id}{$s});
if ($s > $tmp_stop ){
push(@exon,join("\t",($chr{$id},"merge","exon",$tmp_start,$tmp_stop,0,$strand{$id},".","gene_id \"".$id."\"; transcript_id \"".$id."\"; exon \"".$i."\";"))."\n");
$tmp_start = $s;
}
$tmp_stop = max( $tmp_stop, $stop{$id}{$s});
}
}
push(@exon,join("\t",($chr{$id},"merge","exon",$tmp_start,$tmp_stop,0,$strand{$id},".","gene_id \"".$id."\"; transcript_id \"".$id."\"; exon \"".($#start + 1)."\";"))."\n");
print OUT join("\t",($chr{$id},"merge","transcript",$tmp_min,$tmp_max,0,$strand{$id},".","gene_id \"".$id."\"; transcript_id \"".$id."\""))."\n";
foreach $e (@exon){
print OUT $e;
}
}
close(OUT);