-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_genes_database.sh
executable file
·161 lines (130 loc) · 5.91 KB
/
create_genes_database.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env bash
create_genes_database () {
if [ ${#@} -ne 3 ]; then
printf 'Wrong number of input arguments.\n\n';
printf 'Usage:\n bash create_genes_database.sh <refgene_table_filename|gene_pred_filename> <genome_2bit_filename> <gene_sqlite3_database_filename>\n\n';
printf 'Arguments:\n';
printf ' - refgene_table_filename:\n';
printf ' Download premade refGene files from:\n';
printf ' ftp://hgdownload.cse.ucsc.edu/goldenPath/<assembly>/database/refGene.txt.gz\n';
printf ' - gene_pred_filename:\n';
printf ' Create Gene Predictions (Extended) file from GTF/GFF with:\n';
printf ' - gtfToGenePred -genePredExt genes.gtf /dev/stdout | gzip -c > genes.txt.gz\n';
printf ' (http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/gtfToGenePred)\n';
printf ' - gtf2tbl.py -i genes.gtf -o /dev/stdout | gzip -c > genes.txt.gz\n';
printf ' - gff2tbl.py -i genes.gff -o /dev/stdout | gzip -c > genes.txt.gz\n';
printf ' - genome_2bit_filename:\n';
printf ' Download genome 2bit files from:\n';
printf ' ftp://hgdownload.cse.ucsc.edu/goldenPath/<assembly>/bigZips/<assembly>.2bit\n';
printf ' Create genome 2bit files from FASTA files with:\n';
printf ' faToTwoBit genome.fa genome.2bit \n';
printf ' (http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/faToTwoBit)\n';
printf ' - gene_sqlite3_database_filename:\n';
printf ' Output gene SQLite3 database filename.\n\n';
return 2;
fi
local gene_pred_filename="${1}";
local genome_2bit_filename="${2}";
local gene_sqlite3_database_filename="${3}";
# Check if all input files exist and have the right extension,
# so there might be a high chance that the correct input files are used.
if [ ! -f "${gene_pred_filename}" ] ; then
printf 'ERROR: GenePred file or RefGene Table file "%s" could not be found.\n' "${gene_pred_filename}";
return 1;
fi
if [ "${gene_pred_filename}" = "${gene_pred_filename%.txt.gz}" ] ; then
printf 'ERROR: GenePred file or RefGene Table file "%s" should end with ".txt.gz".\n' "${gene_pred_filename}";
return 1;
fi
if [ ! -f "${genome_2bit_filename}" ] ; then
printf 'ERROR: Genome 2bit file "%s" could not be found.\n' "${genome_2bit_filename}";
return 1;
fi
if [ "${genome_2bit_filename}" = "${genome_2bit_filename%.2bit}" ] ; then
printf 'ERROR: Genome 2bit file "%s" should end with ".2bit".\n' "${genome_2bit_filename}";
return 1;
fi
if [ $(type sqlite3 > /dev/null 2>&1; echo $?;) -ne 0 ] ; then
printf 'ERROR: Add "sqlite3" to ${PATH}.\n';
return 1;
fi
if [ $(type twoBitInfo > /dev/null 2>&1; echo $?;) -ne 0 ] ; then
printf 'ERROR: Add "twoBitInfo" (http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/twoBitInfo) to ${PATH}.\n';
return 1;
fi
# Specify all SQL queries.
# Original SQL scheme for refGene table can be found at:
# ftp://hgdownload.cse.ucsc.edu/goldenPath/<assembly>/database/refGene.sql
local CREATE_TABLE_GENES_QUERY='
CREATE TABLE `genes` (
`geneID` VARCHAR(255) NOT NULL,
`chromosome` VARCHAR(255) NOT NULL,
`strand` CHAR(1) NOT NULL,
`txStart` INTEGER NOT NULL,
`txEnd` INTEGER NOT NULL,
`cdsStart` INTEGER NOT NULL,
`cdsEnd` INTEGER NOT NULL,
`exonCount` INTEGER NOT NULL,
`exonStarts` LONGBLOB NOT NULL,
`exonEnds` LONGBLOB NOT NULL,
`geneName` VARCHAR(255) NOT NULL
);
'
local CREATE_INDICES_FOR_GENES_TABLE_QUERY='
CREATE INDEX `geneID` ON `genes` (`geneID`);
CREATE INDEX `geneName` ON `genes` (`geneName`);
CREATE INDEX `location` ON `genes` (`chromosome`, `strand`, `txStart`, `txEnd`);
';
local CREATE_TABLE_CHROMOSOMES_QUERY='
CREATE TABLE `chromosomes` (
`name` VARCHAR(255) NOT NULL,
`length` INTEGER NOT NULL
);'
local CREATE_INDEX_FOR_CHROMOSOMES_TABLE_QUERY='
CREATE INDEX `name` ON `chromosomes` (`name`);
'
# Remove old database file, if it exists.
rm -f "${gene_sqlite3_database_filename}";
# Create "genes" and "chromsomes" tables.
sqlite3 -line \
"${gene_sqlite3_database_filename}" \
"${CREATE_TABLE_GENES_QUERY}${CREATE_TABLE_CHROMOSOMES_QUERY}"
# Import refGene data in "genes" table:
# - Only keep the following columns:
# name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, name2
zcat "${gene_pred_filename}" \
| awk \
-F '\t' \
-v 'OFS=\t' \
'
{
if ($1 ~ /^#/) {
# Skip header line.
next;
} else if (NF == 15) {
# GenePred file without "bin" column.
print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $12;
} else if (NF == 16) {
# GenePred file with "bin" column.
print $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $13;
}
}' \
| sqlite3 \
-separator $'\t' \
-line \
"${gene_sqlite3_database_filename}" \
'.import /dev/stdin genes';
# Import chromosome name and chromosome size data in "chromosomes" tables.
twoBitInfo "${genome_2bit_filename}" stdout \
| sqlite3 \
-separator $'\t' \
-line \
"${gene_sqlite3_database_filename}" \
'.import /dev/stdin chromosomes';
# Create indices for "genes" and "chromosomes" tables.
sqlite3 \
-line \
"${gene_sqlite3_database_filename}" \
"${CREATE_INDICES_FOR_GENES_TABLE_QUERY}${CREATE_INDEX_FOR_CHROMOSOMES_TABLE_QUERY}";
}
create_genes_database "${@}";