-
Notifications
You must be signed in to change notification settings - Fork 21
/
create_taxonomy_db.sh
executable file
·136 lines (111 loc) · 4.03 KB
/
create_taxonomy_db.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/bin/bash
#
# create_taxonomy_db.sh
#
# This script creates the SQLite taxonomy database using NCBI downloadable files
#
# Chiu Laboratory
# University of California, San Francisco
# January, 2014
#
# Copyright (C) 2014 Scot Federman - All Rights Reserved
# SURPI has been released under a modified BSD license.
# Please see license file for details.
# Basename of this script; used as the second field of every log line.
scriptname=${0##*/}
# Tag list handed to "tagTaxonomy.py load --tagfile" once the taxonomy
# database has been built. Defaults to the historical install location;
# export SURPI_TAG_DB_FILE before running this script to use another file.
tag_db_file="${SURPI_TAG_DB_FILE:-/usr/local/bin/surpi-dev/tagging_list_5.txt}"
# Parse the command-line switches:
#   -d DIR   directory containing the NCBI downloads    -> db_directory
#   -g       base the databases on GI numbers           -> GI=1
#   -h       show the help text                         -> HELP=1
#   -m T|F   adjust taxids using merged.dmp             -> MERGED
# The leading ':' in the optstring selects getopts' silent error mode, so a
# missing argument arrives as ':' and an unknown switch as '?'; both are
# reported on stderr and abort with status 1 instead of being ignored.
while getopts ":d:ghm:" option; do
  case "${option}" in
    d) db_directory=${OPTARG} ;;
    g) GI=1 ;;
    h) HELP=1 ;;
    m) MERGED=${OPTARG} ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      exit 1
      ;;
  esac
done
# Print the help text and stop when -h was given or when the script was
# started without any arguments.
if [[ ${HELP:-0} -eq 1 || $# -lt 1 ]]; then
  # ${bold} and ${normal} are not assigned anywhere in this script. When the
  # caller's environment does not supply them either, take them from the
  # terminal - but only if stdout is a terminal, so redirected help output
  # stays plain text.
  if [[ -z ${bold:-} && -z ${normal:-} && -t 1 ]]; then
    bold=$(tput bold 2>/dev/null)
    normal=$(tput sgr0 2>/dev/null)
  fi
  cat <<USAGE
${bold}$scriptname${normal}
This script will create the taxonomy SQLite database using NCBI downloadable files.
${bold}Command Line Switches:${normal}
-h Show this help
-g Base databases on GI numbers (default: use accession)
-d Specify directory containing NCBI data
-m Specify whether to adjust taxid using merged.dmp [T (default)/F]
This step will use the merged.dmp file (from NCBI taxonomy). This file lists old taxids and
their new taxid.
${bold}Usage:${normal}
$scriptname -d <NCBI data directory> [-g] [-m T|F]
USAGE
  # Bare exit: the status is that of the cat above, as before.
  exit
fi
# Validate the -m switch. It defaults to "T" when not supplied; any value
# other than T or F is a usage error, reported on stderr with a non-zero
# exit status so that calling scripts can detect it.
case "${MERGED:-}" in
  "")
    MERGED="T"
    ;;
  T | F) ;;
  *)
    echo "-m option must be T or F." >&2
    exit 1
    ;;
esac
# Build the taxonomy SQLite databases from the NCBI downloads in
# $db_directory. Two lookup flavours exist; they differ only in which
# id->taxid files are unpacked and in whether create_taxonomy_db.py gets --gi:
#   legacy GI (-g) : gi_taxid_nucl.dmp / gi_taxid_prot.dmp
#   accession      : nucl_gb.accession2taxid / prot.accession2taxid
#                    (new lookup files have 4 columns:
#                     accession accession.version taxid gi)
# All files are unpacked into the current working directory.

# Print one log line as "<date> TAB <script name> TAB <message>".
log_msg() {
  printf '%s\t%s\t%s\n' "$(date)" "$scriptname" "$1"
}

# Log a fatal problem on stderr and stop with a non-zero exit status.
fail() {
  log_msg "$1" >&2
  exit 1
}

if [[ ${GI:-0} -eq 1 ]]; then
  lookup_files=(gi_taxid_nucl.dmp gi_taxid_prot.dmp)
  db_args=(--gi)
else
  lookup_files=(nucl_gb.accession2taxid prot.accession2taxid)
  db_args=()
fi
if [[ $MERGED == "T" ]]; then
  db_args+=(--merge)
fi

# Without -d the paths below would silently resolve against "/".
if [[ -z ${db_directory:-} ]]; then
  fail "No NCBI data directory specified (use -d). Exiting..."
fi

# All three downloads must be present before anything is unpacked.
[[ -f "$db_directory/taxdump.tar.gz" ]] \
  || fail "Necessary files not found. Exiting..."
for lookup in "${lookup_files[@]}"; do
  [[ -f "$db_directory/${lookup}.gz" ]] \
    || fail "Necessary files not found. Exiting..."
done
log_msg "Taxonomy files found."

log_msg "Unzipping downloads..."
tar xfz "$db_directory/taxdump.tar.gz" \
  || fail "Could not extract $db_directory/taxdump.tar.gz. Exiting..."
for lookup in "${lookup_files[@]}"; do
  # Decompress to stdout so the downloaded .gz stays untouched.
  pigz -dc -k "$db_directory/${lookup}.gz" > "$lookup" \
    || fail "Could not decompress $db_directory/${lookup}.gz. Exiting..."
done

# The grep below "fixes" the issue whereby aliases, misspellings, and other
# alternate names are returned. We could simply look for a name that is a
# "scientific name" at query time, but filtering here shrinks the db a bit,
# speeding up lookups, and removes data we do not need at this time.
# names.dmp is expected to come out of the taxdump archive unpacked above.
log_msg "Retaining scientific names..."
grep "scientific name" names.dmp > names_scientificname.dmp \
  || fail "Could not extract scientific names from names.dmp. Exiting..."

log_msg "Starting creation of taxonomy SQLite databases..."
create_taxonomy_db.py "${db_args[@]}" \
  || fail "create_taxonomy_db.py failed. Exiting..."
# Add tags
# The steps above leave all of their files in the current directory, so the
# taxonomy db is addressed here by its bare file name.
tax_db_file="names_nodes_scientific.db"
tag_status=0
tagTaxonomy.py load --tagfile "$tag_db_file" --taxdb "$tax_db_file" || tag_status=$?
if [[ $tag_status -eq 0 ]]; then
  echo -e "$(date)\t$scriptname\tCompleted creation of taxonomy SQLite databases."
else
  # Do not claim success when the tagging step failed.
  echo -e "$(date)\t$scriptname\tTagging of $tax_db_file failed (exit status $tag_status)." >&2
fi
# Remove the unpacked intermediates whether or not tagging succeeded.
# "--" and the "./" prefix keep file names that start with '-' from being
# read as rm options.
# NOTE(review): in accession mode the uncompressed *.accession2taxid files
# do not match *.dmp and are therefore left behind - confirm whether that is
# intended before adding them here.
rm -f -- ./*.dmp gc.prt readme.txt
if [[ $tag_status -ne 0 ]]; then
  exit "$tag_status"
fi