#!/usr/bin/python2.4
#
# Copyright 2009 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Datastore models for storing CAP.
In addition to direct subclasses of db.Model, this module defines Shadow*
classes that handle dereferencing of types that are lists of references, as
well as derived properties that are available for Django templates. For
example, ShadowCrawl is a subclass of Crawl, and it adds the 'feeds' derived
property.
References to the 'CAP 1.1' standard refer to sections in this document:
http://www.oasis-open.org/committees/download.php/14759/emergency-CAPv1.1.pdf
"""
__author__ = '[email protected] (Matt Frantz)'

import datetime

try:
  # google3
  from google3.apphosting.ext import db
  from google3.pyglib import logging
  from google3.dotorg.gongo.appengine_cap2kml import db_util
  from google3.dotorg.gongo.appengine_cap2kml import web_query
except ImportError:
  import logging
  from google.appengine.ext import db
  import db_util
  import web_query

# TODO(Matt Frantz): Decide which attributes should be TextProperty and which
# should be StringProperty. TextProperty is unlimited length, but can't be an
# index key. StringProperty is <500 characters, but can be indexed (and thus
# filtered in a query).
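# For example (illustrative, not part of the original): an equality filter
# like CapAlert.gql('WHERE identifier = :1', ident) works because
# 'identifier' is a StringProperty; 'text' is a TextProperty, which is never
# indexed, so the same filter on it would match nothing.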
class Crawl(db.Model):
  """Persistent state of a single crawl."""

  is_done = db.BooleanProperty(default=False)
  started = db.DateTimeProperty()
  finished = db.DateTimeProperty()
  feed_urls = db.StringListProperty()


DEFAULT_CRAWL_PERIOD_IN_MINUTES = 60
DEFAULT_CRAWL_PERIOD = datetime.timedelta(
    minutes=DEFAULT_CRAWL_PERIOD_IN_MINUTES)


class Feed(db.Model):
  """Site containing CAP data."""

  url = db.StringProperty()
  is_crawlable = db.BooleanProperty(default=True)
  is_root = db.BooleanProperty(default=True)
  crawl_period_in_minutes = db.IntegerProperty(
      default=DEFAULT_CRAWL_PERIOD_IN_MINUTES)
  last_crawl = db.Reference(Crawl)

  def __str__(self):
    return str(db_util.ModelAsDict(Feed, self))


class CapAlert(db.Model):
  """CAP file from a feed."""

  crawl = db.Reference(Crawl)
  feed = db.Reference(Feed)
  url = db.StringProperty()
  text = db.TextProperty()
  parse_errors = db.ListProperty(db.Text)

  # CAP alert properties that we care about. (CAP 1.1 sec 3.2.1)
  identifier = db.StringProperty()
  sender = db.StringProperty()
  sent = db.DateTimeProperty()
  status = db.StringProperty()  # enum
  msgType = db.StringProperty()  # enum
  source = db.StringProperty()
  scope = db.StringProperty()  # enum
  restriction = db.StringProperty()
  # TODO(Matt Frantz): Save "addresses" when Datastore has text search.
  code = db.StringListProperty()
  # TODO(Matt Frantz): Save "note" when Datastore has text search.
  references = db.StringListProperty()
  # TODO(Matt Frantz): Save "incidents" when Datastore has text search.

  # Info.
  language = db.StringListProperty()
  category = db.StringListProperty()  # enum
  # TODO(Matt Frantz): Save "event" when Datastore has text search.
  responseType = db.StringListProperty()  # enum
  urgency = db.StringListProperty()  # enum
  severity = db.StringListProperty()  # enum
  certainty = db.StringListProperty()  # enum
  audience = db.StringListProperty()
  # TODO(Matt Frantz): Save "eventCode" tag/value pairs.
  effective = db.ListProperty(datetime.datetime)
  onset = db.ListProperty(datetime.datetime)
  expires = db.ListProperty(datetime.datetime)
  senderName = db.StringListProperty()
  # TODO(Matt Frantz): Save "headline" when Datastore has text search.
  # TODO(Matt Frantz): Save "description" when Datastore has text search.
  # TODO(Matt Frantz): Save "instruction" when Datastore has text search.
  web = db.StringListProperty()  # URI
  contact = db.StringListProperty()
  # TODO(Matt Frantz): Save "parameter" tag/value pairs.
  # TODO(Matt Frantz): Save "eventCode".

  # Resource.
  resourceDesc = db.StringListProperty()
  mimeType = db.StringListProperty()
  size = db.ListProperty(long)  # unit?
  uri = db.StringListProperty()  # URI
  # TODO(Matt Frantz): Save "derefUri"?
  # TODO(Matt Frantz): Save "digest"?

  # Area.
  # TODO(Matt Frantz): Save "areaDesc" when Datastore has text search.
  # TODO(Matt Frantz): Save "polygon" in an indexable way.
  # TODO(Matt Frantz): Save "circle" in an indexable way.
  # TODO(Matt Frantz): Save "geocode" tag/value pairs?
  altitude = db.ListProperty(float)
  ceiling = db.ListProperty(float)

  def __str__(self):
    return str(db_util.ModelAsDict(CapAlert, self))


class CrawlShard(db.Model):
  """Single atom of crawl work, which is a URL."""

  crawl = db.Reference(Crawl)
  feed = db.Reference(Feed)
  url = db.TextProperty()
  is_done = db.BooleanProperty(default=False)

  # When is_done is True, then the following may be populated.
  started = db.DateTimeProperty()
  finished = db.DateTimeProperty()
  error = db.TextProperty()
  parse_errors = db.ListProperty(db.Text)


class ShadowCrawl(Crawl):
  """Shadow for Crawl that provides derived properties."""

  def __init__(self, crawl):
    """Initializes a ShadowCrawl from a Crawl.

    Args:
      crawl: Crawl object
    """
    super(ShadowCrawl, self).__init__(**db_util.ModelAsDict(Crawl, crawl))
    self.__feeds = None
    self.__shards = None
    self.__shards_remaining = None
    self.__key = crawl.key()

  def key(self):
    return self.__key

  @property
  def feeds(self):
    """Returns the set of feeds involved in this crawl.

    Returns:
      Set of Feed objects.
    """
    if not self.__feeds:
      feeds = set()
      for shard in self.shards:
        feed = db_util.SafelyDereference(shard, 'feed')
        if feed:
          feeds.add(feed)
      self.__feeds = frozenset(feeds)
    return self.__feeds

  @property
  def shards(self):
    """Returns all shards for this crawl.

    Returns:
      List of CrawlShard objects.
    """
    if not self.__shards:
      self.__shards = DereferenceFilterAndShadow(self, 'CrawlShard',
                                                 shadow_class=ShadowCrawlShard)
    return self.__shards


class ShadowCrawlShard(CrawlShard):
  """Shadow for CrawlShard that provides derived properties."""

  def __init__(self, crawl_shard):
    """Initializes a ShadowCrawlShard from a CrawlShard.

    Args:
      crawl_shard: CrawlShard object
    """
    super(ShadowCrawlShard, self).__init__(
        **db_util.ModelAsDict(CrawlShard, crawl_shard))


class ShadowFeed(Feed):
  """Shadow for Feed that provides derived properties."""

  def __init__(self, feed):
    """Initializes a ShadowFeed from a Feed.

    Args:
      feed: Feed object
    """
    super(ShadowFeed, self).__init__(**db_util.ModelAsDict(Feed, feed))
    self.__key = feed.key()

  def key(self):
    return self.__key


def _FilterModels(model_name, models, web_query):
  for model in models:
    if web_query.PermitsModel(model_name, model):
      yield model


def _ShadowModels(shadow_class, models):
  for model in models:
    shadow = shadow_class(model)
    yield shadow


def DereferenceFilterAndShadow(referenced_model, subordinate_model_name,
                               shadow_class=None, web_query=None):
  """Dereferences a set of keys, filters and shadows the resulting objects.

  Args:
    referenced_model: Model object being referenced. It must have a standard
      back-pointer property for the subordinate_model_name.
    subordinate_model_name: Name of the model of the dereferenced objects.
      Model class must include a ReferenceProperty that refers to the
      referenced_model.
    shadow_class: Class object for the shadow of the dereferenced objects,
      or None for no shadowing.
    web_query: web_query.Query object or None for no filter.

  Returns:
    List of the subordinate models or their shadows.
  """
  # Use the back-pointers to dereference.
  subordinate_models = db_util.SafelyDereference(
      referenced_model,
      '%s_set' % subordinate_model_name.lower())
  if not subordinate_models:
    return []

  # Apply any web query.
  if web_query:
    subordinate_models = _FilterModels(
        subordinate_model_name, subordinate_models, web_query)

  # Wrap in the shadow class.
  if shadow_class:
    subordinate_models = _ShadowModels(shadow_class, subordinate_models)

  return list(subordinate_models)
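# Illustrative call (it mirrors ShadowCrawl.shards above): fetch the shards of
# a crawl through the automatic 'crawlshard_set' back-reference, wrapping each
# result in its shadow class:
#
#   shards = DereferenceFilterAndShadow(crawl, 'CrawlShard',
#                                       shadow_class=ShadowCrawlShard)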


def LastCrawls():
  """Returns the last completed crawl for each feed.

  Returns:
    Set of Crawl keys (db.Key), empty if there are no crawls.
  """
  # We only need to look at root feeds, because the children are only
  # crawled when their root is crawled.
  query = Feed.gql('WHERE is_root = :1', True)
  crawls = set()
  limit = 100
  offset = 0
  while True:
    feeds = query.fetch(limit, offset=offset)
    offset += limit
    if feeds:
      for feed in feeds:
        crawl = feed.last_crawl
        if crawl:
          crawls.add(crawl.key())
    else:
      return crawls
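# Illustrative follow-up (not in the original): the loop pages through feeds
# in batches of 100 and collects keys, so callers can batch-fetch the actual
# entities with the datastore multi-get, e.g.
#
#   crawls = db.get(list(LastCrawls()))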


# TODO(Matt Frantz): Remove obsolete models, which are sticking around only to
# allow them to be purged.
class Cap(db.Model):
  crawl = db.Reference(Crawl)


class CapInfo(db.Model):
  crawl = db.Reference(Crawl)


class CapArea(db.Model):
  crawl = db.Reference(Crawl)


class CapResource(db.Model):
  crawl = db.Reference(Crawl)