-
Notifications
You must be signed in to change notification settings - Fork 0
/
matrixtwo_ext_V2_A.pl
executable file
·295 lines (228 loc) · 13.5 KB
/
matrixtwo_ext_V2_A.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#/usr/bin/perl -w
use strict;
require "/media/daniel/NGS1/RNASeq/find_circ/read_mapping.pl"; # module reading mapping file for additional information- can be ignored when useless
use List::MoreUtils qw(uniq); # used to get to a list of unique samplenames later
############################################################ usage
# perl matrixtwo.pl matrixmaker_outfile.csv matrixtwo_output.tsv
############################################################
# matrixtwo.pl
# - needs an infile -> the correct infile format is made by matrixmaker.pl as ouput, can directly given to matrixtwo
# - adds additional biologic information, but also removes information from the first matrix that is not used in the heatmap that will be created with the output from this script
# - needs output file name , outputs in a .tsv file format
# - will run in the dir where it was started
# - will output errors into ER file : /home/daniel/logfile_auto.log, can be changed
############## example input :
#coordinates strand RefseqID Gene known_circ num_samples_present total_sum_unique_counts qualities present_in_sample sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB sample -unique_count -qualA -qualB
#chr10:102683731-102685776 + NM_001136123 SLF2 hsa_circ_0006654 16 88 ,6;40,40;40,40;5,40;5,6;40,6;40,6;40,40;40,40;5,40;40,40;40,40;40,40;40,40;5,40;40,40;40 -run_hal01_test1a-run_697_r_test1a-run_hal01_r_test1a-run_hal01_r_test1c-run_hal01_test1c-run_hal01_test1d-run_hal01_test1b-run_697_r_test1c-run_hal01_r_test1b-run_697_test1c-run_697_r_test1b-run_697_test1e-run_697_r_test1d-run_hal01_r_test1d-run_697_test1a-run_697_test1d run_hal01_test1a 5 6 40 run_697_r_test1a 7 40 40 run_hal01_r_test1a 7 40 5 run_hal01_r_test1c 7 40 5 run_hal01_test1c 5 6 40 run_hal01_test1d 5 6 40 run_hal01_test1b 5 6 40 run_697_r_test1c 7 40 40 run_hal01_r_test1b 7 40 5 run_697_test1c 3 40 40 run_697_r_test1b 7 40 40 run_697_test1e 3 40 40 run_697_r_test1d 7 40 40 run_hal01_r_test1d 7 40 5 run_697_test1a 3 40 40 run_697_test1d 3 40 40
#chr10:102683734-102685776 + NM_001136123 SLF2 unknown 4 8 ,40;40,40;40,40;40,40;40 -run_697_r_test1a-run_697_r_test1c-run_697_r_test1b-run_697_r_test1d run_hal01_test1a 0 0 0 run_697_r_test1a 2 40 40 run_hal01_r_test1a 0 0 0 run_hal01_r_test1c 0 0 0 run_hal01_test1c 0 0 0 run_hal01_test1d 0 0 0 run_hal01_test1b 0 0 0 run_697_r_test1c 2 40 40 run_hal01_r_test1b 0 0 0 run_697_test1c 0 0 0 run_697_r_test1b 2 40 40 run_697_test1e 0 0 0 run_697_r_test1d 2 40 40 run_hal01_r_test1d 0 0 0 run_697_test1a 0 0 0 run_697_test1d 0 0 0
#
############# example output :
#
#coordinates refseqid gene circn hallm biom_desc run_hal01_test1a run_697_r_test1a run_hal01_r_test1a run_hal01_r_test1c run_hal01_test1c run_hal01_test1d run_hal01_test1b run_697_r_test1c run_hal01_r_test1b run_697_test1c run_697_r_test1b run_697_test1e run_697_r_test1d run_hal01_r_test1d run_697_test1a run_697_test1d
#chr10:102683731-102685776 NM_001136123 SLF2 hsa_circ_0006654 none SMC5-SMC6_complex_localization_factor_2_ 5 7 7 7 5 5 5 7 7 3 7 3 7 7 3 3
#chr10:102683734-102685776 NM_001136123 SLF2 unknown none SMC5-SMC6_complex_localization_factor_2_ 0 2 0 0 0 0 0 2 0 0 2 0 2 0 0 0
#
#
#
#
open(ER,'>>',"/home/daniel/logfile_auto.log")||die "$!"; # global logfile
my $start = time;
my$linfile= $ARGV[0];
chomp $linfile;
print ER "reading input file $linfile ...\n";
# output file second argument adding coordinates
open(IN,$linfile)|| die "$!";
my@allelines= <IN>;
my$outfile=$ARGV[1]; # outfile
chomp $outfile;
open (OUT ,">",$outfile)|| die "$!";
## mapping file with hallmark gene names : beginning of line is hallmark** then website http://www.broadinstitute.org/gsea/msigdb/cards/HALLMARK_CHOLESTEROL_HOMEOSTASIS then gene names
my$hallmark_mapping_file="/media/daniel/NGS1/RNASeq/find_circ/hallmark_genes.tsv"; # unusual mapping file, not one gene per line
open(MA,$hallmark_mapping_file) || die "$!";
# uses subroutine map_file from read_mapping.pl
my%mart_info=map_file("/media/daniel/NGS1/RNASeq/find_circ/mart_export_ensembl_gene_desc.txt",1,2,"\t");
my@mart_infos= keys %mart_info;
########################################################################### gene mapping file reading into hash %mapping
##exapmle line : HALLMARK_TNFA_SIGNALING_VIA_NFKB http://www.broadinstitute.org/gsea/msigdb/cards/HALLMARK_TNFA_SIGNALING_VIA_NFKB JUNB CXCL2 ATF3 NFKBIA TNFAIP3 PTGS2 CXCL1 IER3 CD83 CCL20 CXCL3 MAFF NFKB2 TNFAIP2 HBEGF KLF6 BIRC3 PLAUR ZFP36 ICAM1 JUN EGR3 IL1B BCL2A1 PPP1R15A ZC3H12A SOD2 NR4A2 IL1A RELB TRAF1 BTG2 DUSP1 MAP3K8 ETS2 F3 SDC4 EGR1 IL6 TNF KDM6B NFKB1 LIF PTX3 FOSL1 NR4A1 JAG1 CCL4 GCH1 CCL2 RCAN1 DUSP2 EHD1 IER2 REL CFLAR RIPK2 NFKBIE NR4A3 PHLDA1 #IER5 TNFSF9 GEM GADD45A CXCL10 PLK2 BHLHE40 EGR2 SOCS3 SLC2A6 PTGER4 DUSP5 SERPINB2 NFIL3 SERPINE1 TRIB1 TIPARP RELA BIRC2 CXCL6 LITAF TNFAIP6 CD44 INHBA PLAU MYC TNFRSF9 SGK1 TNIP1 NAMPT FOSL2 PNRC1 ID2 CD69 IL7R EFNA1 PHLDA2 PFKFB3 CCL5 YRDC IFNGR2 SQSTM1 BTG3 GADD45B KYNU G0S2 BTG1 MCL1 VEGFA MAP2K3 CDKN1A CYR61 TANK IFIT2 IL18 TUBB2A IRF1 FOS OLR1 RHOB AREG NINJ1 ZBTB10 PPAP2B KLF4 CXCL11 SAT1 CSF1 GPR183 PMEPA1 PTPRE TLR2 CXCR7 KLF10 MARCKS #LAMB3 CEBPB TRIP10 F2RL1 KLF9 LDLR TGIF1 RNF19B DRAM1 B4GALT1 DNAJB4 CSF2 PDE4B
my@allemappings= <MA>;
my%mapping_hash=(); # mapping hash: gene name is key, hallmark mechanism is value?
# each line now one array part
#print "reading gene mapping...\n";
foreach my $mapline (@allemappings){
# fill a hash that is used later
chomp $mapline;
my@mappingline_parts=split(/\s+/,$mapline);
my$hallmarktype_full=shift @mappingline_parts;# getting hallmark description properly with some regex cleaning by shifting first array index
$hallmarktype_full=~s/HALLMARK//;
$hallmarktype_full=~s/\s+//;
my$address= shift @mappingline_parts;
#my$allhallmarkg= join(@mappingline_parts,"_");
#now a string of gene1_genetwo_gene3
# rest of the line is all hallmark genes, need to be cleaned up before saving
foreach my $hallmg (@mappingline_parts){
$hallmg=~s/\s+//;
if($hallmg=~/[A-Z]/){ # checking for empty lines and gene names
$mapping_hash{"$hallmg"}="$hallmarktype_full";
#print "mapping _:$hallmg:_ to type $hallmarktype_full g ---\n";
}
# print "mapping $hallmg to type $hallmarktype_full g ---\n";
}
# $mapping_hash{"$allhallmarkg"}="$hallmarktype_full"# gene string is key, hallmark type is value
}
my@allehallmarkg=keys %mapping_hash;
my@all_hm_genes= values %mapping_hash;
############################################################################
# arrays in use
my@sampleuniqc=();# positions of unique count columns
my@samplenames=();# names of all detected samples
my@uniqcounts=(); # where maybe all unique counts will be added into a two-dimensional array?
my@headers=(); # headers with relevant information for each circ candidates
my@alluniques=();# empty yet
# additional information from circBank: coding potential + mirRNA interaction sites
# mapping those files
# example line mirRNA file:
#id circRNA_id chr start end strand spliced_seq_length annotation best_transcript gene_symbol mm9_circRNA_id
#hsa_circA1CF_001 hsa_circ_0018410 chr10 52575765 52623840 - 1470 ANNOTATED, CDS, coding, INTERNAL, OVCODE, OVERLAPTX, OVEXON, UTR5 NM_001198819 A1CF #N/A
#hsa_circA1CF_002 hsa_circ_0018409 chr10 52559168 52573822 - 7964 ANNOTATED, CDS, coding, OVCODE, OVERLAPTX, OVEXON, UTR3 NM_001198819 A1CF #N/A
#hsa_circA2ML1_001 hsa_circ_0025378 chr12 8976315 8988935 + 482 ANNOTATED, CDS, coding, INTERNAL, OVCODE, OVERLAPTX, OVEXON NM_144670 A2ML1 #N/A
#hsa_circA2ML1_002 hsa_circ_0025379 chr12 8990035 8998818 + 955 ANNOTATED, CDS, coding, INTERNAL, OVCODE, OVEXON NM_144670 A2ML1 #N/A
#hsa_circA2ML1_003 hsa_circ_0025380 chr12 8993964 9001510 + 948 ANNOTATED, CDS, coding, INTERNAL, OVCODE, OVEXON NM_144670 A2ML1 #N/A
# since there is no real interaction prediction, we here do the circbank coordinates to id conversion
# in the second file we can then get from id to coding probability
my$circbank_mirRNA_file="/media/daniel/NGS1/RNASeq/find_circ/auto_find_circ/miRNA_circRNA_ineractions.txt"; # unusual mapping file, not one gene per line
open(MI,$circbank_mirRNA_file) || die "$!";
my@allintermirrna=<MI>;
my%coords_to_circ_hash=();
my%coords_length_mouse_info_hash=();
#my%coords_mouse_cons=()
foreach my $mir_cb_line (@allintermirrna){
my$mouse=0;
my$full_mouse_coords="Na";
my@parts_filem=split(/\s+/,$mir_cb_line);
# build into coordinates
my$chromosome=$parts_filem[2];
my$strt=$parts_filem[3];
my$nd=$parts_filem[4];
my$strnd_map_circbank=$parts_filem[5]; # for later checking the strand
my$spliced_length=$parts_filem[6];# maybe use later
my$full_cordss="$chromosome:$strt-$nd";# this can be key or value now
my$circ_name_conv=$parts_filem[1];
$coords_to_circ_hash{"$circ_name_conv"}="$full_cordss";# circ name is key, coords are value
my$whole_line=join(/_/,@parts_filem);
# mouse circRNA detection
if($whole_line=~/mmu_circ/){
my$mouse_cords=$';
$full_mouse_coords="mmu_circ_$mouse_cords"
}
# key: coordinates , value= spliced_length and mouse coords if found
$coords_length_mouse_info_hash{"$full_cordss"}="$spliced_length\t$full_mouse_coords";
}
# now translate that into coding potential
my$circbank_coding_file="/media/daniel/NGS1/RNASeq/find_circ/auto_find_circ/circRNA_protein_coding_potential.txt"; # unusual mapping file, not one gene per line
open(CO,$circbank_coding_file) || die "$!";
my@allincoding=<CO>;
my%circ_to_prob_hash=();
foreach my $codingcb_line (@allincoding){
my@parts_fileco=split(/\s+/,$codingcb_line);
my$circ_name=$parts_fileco[0];
my$coding_prob=$parts_fileco[5];
$circ_to_prob_hash{"$circ_name"}=$coding_prob;
}
# default non-matching descriptions
my$marti="NaN";
my$hallm="none\t";
# starting to read inputfile line by line
for (my $var = 0; $var < scalar(@allelines); $var++) {
my$longline=$allelines[$var];
if (!($longline=~/coordinates/g)) { # ignoring header
# getting relevant information for each circ candidate ...
my@lineparts=split(/\t/,$longline);
my$coords=$lineparts[0];
my$strand=$lineparts[1];
my$refseqID=$lineparts[2];
my$gene=$lineparts[3];
my$circn=$lineparts[4];
# adding hallmark gene type
# print "refseqid is $refseqID\n";
if(grep(/$gene/,@allehallmarkg)){ # get all samplenames into @allenames)
# print "looking for $gene in gene mapping ---\n";
# find hallmark class and add to matrix file
if($mapping_hash{$gene}=~/[A-Z]/){
$hallm=$mapping_hash{$gene};
}
}
else{
$hallm="none";
}
if(grep(/$gene?/,@mart_infos)){ # mart mapping
# gene has information available to it
$marti=$mart_info{$gene};
$marti=~s/\[.*\]//g;
$marti=~s/\ /_/g;
}
else{
$marti="NaN";
}
# check for empty mart information
if(!($marti=~/[A-z]/gi)){
$marti="NaN";
}
# integration of circbank data
my$prob_coding="nA";
if($circ_to_prob_hash{"$circn"}=~/[0-9]/){# get the coding probability based on hssa_circ_name in both files
$prob_coding=$circ_to_prob_hash{"$circn"};
}
# mouse and length data
my$length_and_mouse_circ="nA\tnone";
if($coords_length_mouse_info_hash{"$coords"}=~/\W+/){
$length_and_mouse_circ=$coords_length_mouse_info_hash{"$coords"};
}
push(@headers,"$coords\t$strand\t$refseqID\t$gene\t$circn\t$hallm\t$marti\t$length_and_mouse_circ\t$prob_coding");# header into array
my$e=0;
my$allthings="";
foreach my$samplepos (@sampleuniqc){
$e++; # second coordinate for two-dimensional array of all unique counts
my$samplename = $lineparts[$samplepos-1];
push(@uniqcounts,$lineparts[$samplepos]); #the unique
# print "found a sample= $samplename \nand its counts are $lineparts[$samplepos] circrna of interest is $lineparts[4]\n";
push (@samplenames,$samplename);
## do the magic and find all unique counts for each sample for each circrna candindate, get all this into a string and then print all that out later
$allthings="$allthings\t$lineparts[$samplepos]";
#$alluniques[$var][$e]=$lineparts[$samplepos];# two-dimensional array
}
push(@alluniques,$allthings);# all unique count positions in .mat1 file
}
else{
# header bar only- catch columns with samplenames in it and their position
my@wholeheader=split(/\t/,$longline);
my $i=0;
foreach my $headername (@wholeheader){
$i++;
if ($headername=~/sample/) {
if(!($headername=~/\_sample/)){
push (@sampleuniqc,$i); #get positions of sample ids
# body...
# print "header position is $i in $headername\n";
}
}
}
}
}
# actual file creation:
# header line in output file ...
my@uniques= uniq @samplenames;
print OUT"coordinates\tstrand\trefseqid\tgene\tcircn\thallm\tbiom_desc\tspliced_length\tmm9_circ\tprob_coding\t";
foreach my $sampl (@uniques){
print OUT"$sampl\t";
}
print OUT "\n";
# now the real content, cleaning it from junk and then printing it
for (my $v = 0; $v < scalar(@headers); $v++) {
my$outline="$headers[$v]$alluniques[$v]";
# messy whitespace cleanup
$outline=~s/\t\t+/\t/g;
$outline=~s/\t\s+/\t/g;
$outline=~s/\s+\t/\t/g;
#print "$headers[$v]\t";
print OUT "$outline\n";
}