Skip to content

Commit

Permalink
Merge pull request #18 from zmap/cdz/add-human-accessible-outputs
Browse files Browse the repository at this point in the history
Add JSON schema descriptions for BQ and ES
  • Loading branch information
cdzombak authored Apr 10, 2018
2 parents 62839e3 + 62a58a8 commit 0e3d8fd
Show file tree
Hide file tree
Showing 4 changed files with 306 additions and 52 deletions.
24 changes: 7 additions & 17 deletions zschema/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@

def usage():
sys.stderr.write("USAGE: %s command schema [file].\n" % sys.argv[0].split("/")[-1])
sys.stderr.write("Valid commands: bigquery, elasticsearch, json, text, html, censys-html, flat, validate.\n")
sys.stderr.write("schema should be defined as file.py:record\n")
sys.stderr.write("Valid commands: bigquery, elasticsearch, docs-es, docs-bq, json, flat, validate.\n")
sys.stderr.write("Schema should be passed as file.py:record\n")
sys.stderr.write("The optional 'file' argument is used only as the test file for the 'validate' command.\n")
sys.stderr.write("VERSION: %s\n" % zschema.__version__)
sys.exit(1)

Expand All @@ -27,26 +28,15 @@ def main():
print json.dumps(record.to_bigquery())
elif command == "elasticsearch":
print json.dumps(record.to_es(recname))
elif command == "docs-es":
print json.dumps(record.docs_es(recname))
elif command == "docs-bq":
print json.dumps(record.docs_bq(recname))
elif command == "json":
print record.to_json()
elif command == "html":
for r in record.to_flat():
type_ = r.get("es_type", "")
print "<tr><td>%s</td><td>%s</td></tr>" % (r["name"], type_)
elif command == "text":
print record.to_text()
elif command == "flat":
for r in record.to_flat():
print json.dumps(r)
elif command == "censys-html":
for r in record.to_flat():
type_ = r.get("es_type", None)
len_ = r["name"].count(".")
style = 'style="padding-left: %ipx"' % (15 * len_ + 5)
if not type_:
print '<tr class="record"><td %s>%s</td><td>%s</td></tr>' % (style, r["name"], "")
else:
print "<tr><td %s>%s</td><td>%s</td></tr>" % (style, r["name"], type_)
elif command == "validate":
if not os.path.exists(sys.argv[3]):
sys.stderr.write("Invalid test file. %s does not exist.\n" % sys.argv[3])
Expand Down
103 changes: 83 additions & 20 deletions zschema/compounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ def _is_valid_object(name, object_):

class ListOf(Keyable):

def __init__(self, object_, max_items=10):
def __init__(self, object_, max_items=10, doc=None, category=None):
self.object_ = object_
self.max_items = max_items
self.category = category
self.doc = doc
_is_valid_object("Anonymous ListOf", object_)

@property
Expand All @@ -33,9 +35,27 @@ def to_bigquery(self, name):
retv["mode"] = "REPEATED"
return retv

def docs_bq(self, parent_category=None):
    """Describe this list field for the BigQuery documentation output.

    The description of the element type (``self.object_.docs_bq``) is used
    as the base and then marked as repeated.  The list's effective category
    is ``self.category``, falling back to ``parent_category``; it is stored
    under ``"category"`` and is also handed to the element so that anything
    nested inside the element can inherit it (previously the element was
    described without it, so the inheritance stopped here).

    :param parent_category: category inherited from the enclosing schema
        object, used when this list has no category of its own.
    :returns: the element's description dict with ``"category"``,
        ``"repeated"`` and, when this list has a doc string, ``"doc"`` set.
    """
    category = self.category or parent_category
    retv = self.object_.docs_bq(parent_category=category)
    # The list-level category always wins for the list entry itself.
    retv["category"] = category
    retv["repeated"] = True
    if self.doc:
        # Only override the element's doc when the list carries its own.
        retv["doc"] = self.doc
    return retv

def to_es(self):
    """Return the Elasticsearch mapping of the element type, unchanged."""
    element_mapping = self.object_.to_es()
    return element_mapping

def docs_es(self, parent_category=None):
    """Describe this list field for the Elasticsearch documentation output.

    The description of the element type (``self.object_.docs_es``) is used
    as the base and then marked as repeated.  The list's effective category
    is ``self.category``, falling back to ``parent_category``; it is stored
    under ``"category"`` and is also handed to the element so that anything
    nested inside the element can inherit it (previously the element was
    described without it, so the inheritance stopped here).

    :param parent_category: category inherited from the enclosing schema
        object, used when this list has no category of its own.
    :returns: the element's description dict with ``"category"``,
        ``"repeated"`` and, when this list has a doc string, ``"doc"`` set.
    """
    category = self.category or parent_category
    retv = self.object_.docs_es(parent_category=category)
    # The list-level category always wins for the list entry itself.
    retv["category"] = category
    retv["repeated"] = True
    if self.doc:
        # Only override the element's doc when the list carries its own.
        retv["doc"] = self.doc
    return retv

def validate(self, name, value):
if type(value) != list:
raise DataValidationException("%s: %s is not a list",
Expand All @@ -59,11 +79,13 @@ def __init__(self,
doc=None,
extends=None,
allow_unknown=False,
exclude=None):
exclude=None,
category=None):
self.definition = definition
self.required = required
self.allow_unknown = allow_unknown
self.doc = doc
self.category = category
self._exclude = set(exclude) if exclude else set([])
# merge
if extends:
Expand Down Expand Up @@ -113,14 +135,25 @@ def merge(self, other):
return self

def to_bigquery(self, name):
fields = [v.to_bigquery(k) for (k,v) in sorted(self.definition.iteritems()) if \
not v.exclude_bigquery]
return {
fields = [v.to_bigquery(k) \
for (k,v) in sorted(self.definition.iteritems()) \
if not v.exclude_bigquery
]
retv = {
"name":self.key_to_bq(name),
"type":"RECORD",
"fields":fields,
"mode":"REQUIRED" if self.required else "NULLABLE"
}
return retv

def docs_bq(self, parent_category=None):
    """Build the BigQuery documentation entry for this record.

    Starts from the dict returned by ``self._docs_common`` and adds a
    ``"fields"`` mapping with one entry per sub-field, keyed by
    ``self.key_to_bq(name)``.  Sub-fields whose ``exclude_bigquery`` flag is
    set are left out.

    The effective category (``self.category``, falling back to
    ``parent_category``) is handed to every sub-field so that fields without
    a category of their own inherit it.  Previously sub-fields were described
    with no parent category, so the fallback never reached them.

    :param parent_category: category inherited from the enclosing schema
        object, used when this record has none of its own.
    :returns: dict describing this record and its sub-fields.
    """
    retv = self._docs_common(parent_category=parent_category)
    category = self.category or parent_category
    retv["fields"] = {
        self.key_to_bq(name): field.docs_bq(parent_category=category)
        for (name, field) in sorted(self.definition.iteritems())
        if not field.exclude_bigquery
    }
    return retv

def print_indent_string(self, name, indent):
tabs = "\t" * indent if indent else ""
Expand All @@ -129,10 +162,28 @@ def print_indent_string(self, name, indent):
value.print_indent_string(name, indent+1)

def to_es(self):
p = {self.key_to_es(k): v.to_es() for k, v in sorted(self.definition.iteritems()) \
p = {self.key_to_es(k): v.to_es() \
for k, v in sorted(self.definition.iteritems()) \
if not v.exclude_elasticsearch}
return {"properties": p}

def _docs_common(self, parent_category):
    """Return the documentation fields shared by the ES and BQ outputs.

    :param parent_category: category to fall back to when this object has no
        category of its own.
    :returns: dict with ``"category"``, ``"doc"``, ``"type"`` (the name of
        this object's class) and ``"required"``.
    """
    if self.category:
        effective_category = self.category
    else:
        effective_category = parent_category
    return {
        "category": effective_category,
        "doc": self.doc,
        "type": self.__class__.__name__,
        "required": self.required,
    }

def docs_es(self, parent_category=None):
    """Build the Elasticsearch documentation entry for this record.

    Starts from the dict returned by ``self._docs_common`` and adds a
    ``"fields"`` mapping with one entry per sub-field, keyed by
    ``self.key_to_es(name)``.  Sub-fields whose ``exclude_elasticsearch``
    flag is set are left out.

    The effective category (``self.category``, falling back to
    ``parent_category``) is handed to every sub-field so that fields without
    a category of their own inherit it.  Previously sub-fields were described
    with no parent category, so the fallback never reached them.

    :param parent_category: category inherited from the enclosing schema
        object, used when this record has none of its own.
    :returns: dict describing this record and its sub-fields.
    """
    retv = self._docs_common(parent_category=parent_category)
    category = self.category or parent_category
    retv["fields"] = {
        self.key_to_es(name): field.docs_es(parent_category=category)
        for name, field in sorted(self.definition.iteritems())
        if not field.exclude_elasticsearch
    }
    return retv

def to_dict(self):
source = sorted(self.definition.iteritems())
p = {self.key_to_es(k): v.to_dict() for k, v in source}
Expand All @@ -152,8 +203,8 @@ def validate(self, name, value):

class NestedListOf(ListOf):

def __init__(self, object_, subrecord_name, max_items=10):
ListOf.__init__(self, object_, max_items)
def __init__(self, object_, subrecord_name, max_items=10, doc=None, category=None):
ListOf.__init__(self, object_, max_items, doc=doc, category=category)
self.subrecord_name = subrecord_name

def to_bigquery(self, name):
Expand All @@ -162,6 +213,19 @@ def to_bigquery(self, name):
})
retv = subr.to_bigquery(self.key_to_bq(name))
retv["mode"] = "REPEATED"
if self.doc:
retv["doc"] = self.doc
return retv

def docs_bq(self, parent_category=None):
    """Describe this nested list for the BigQuery documentation output.

    The element type is wrapped in a ``ListOf`` and placed in a temporary
    one-field ``SubRecord`` under ``self.subrecord_name``; that sub-record's
    description is returned, marked as repeated.

    :param parent_category: category to use when this object has none of its
        own.
    :returns: the wrapper sub-record's description dict with ``"repeated"``
        and, when this object has a doc string, ``"doc"`` set.
    """
    inner_list = ListOf(self.object_)
    wrapper = SubRecord({self.subrecord_name: inner_list})
    if self.category:
        effective_category = self.category
    else:
        effective_category = parent_category
    described = wrapper.docs_bq(parent_category=effective_category)
    described["repeated"] = True
    if self.doc:
        described["doc"] = self.doc
    return described


Expand All @@ -170,24 +234,25 @@ class Record(SubRecord):
def to_es(self, name):
    """Return this record's Elasticsearch mapping wrapped under ``name``.

    :param name: key under which the mapping is returned.
    :returns: single-entry dict ``{name: <SubRecord mapping>}``.
    """
    mapping = SubRecord.to_es(self)
    return {name: mapping}

def docs_es(self, name, parent_category=None):
    """Return this record's Elasticsearch documentation wrapped under ``name``.

    :param name: key under which the description is returned.
    :param parent_category: category to use when this record has none of its
        own.
    :returns: single-entry dict ``{name: <SubRecord description>}``.
    """
    if self.category:
        effective_category = self.category
    else:
        effective_category = parent_category
    described = SubRecord.docs_es(self, parent_category=effective_category)
    return {name: described}

def to_bigquery(self):
source = sorted(self.definition.iteritems())
return [s.to_bigquery(name) for (name, s) in source \
if not s.exclude_bigquery]

def to_html(self):
    """Placeholder with no implementation; does nothing and returns None."""
return [s.to_bigquery(name) \
for (name, s) in source \
if not s.exclude_bigquery
]

def to_documented_html(self):
    """Placeholder with no implementation; does nothing and returns None."""
def docs_bq(self, name, parent_category=None):
    """Return this record's BigQuery documentation wrapped under ``name``.

    :param name: key under which the description is returned.
    :param parent_category: category to use when this record has none of its
        own.
    :returns: single-entry dict ``{name: <SubRecord description>}``.
    """
    if self.category:
        effective_category = self.category
    else:
        effective_category = parent_category
    described = SubRecord.docs_bq(self, parent_category=effective_category)
    return {name: described}

def print_indent_string(self):
    """Ask every top-level field to print itself at indent level 0.

    Fields are visited in sorted order of the ``(name, field)`` pairs of
    ``self.definition``.
    """
    ordered = sorted(self.definition.iteritems())
    for entry in ordered:
        key, member = entry
        member.print_indent_string(key, 0)

def to_dotted_text(self):
    """Placeholder with no implementation; does nothing and returns None."""

def validate(self, value):
if type(value) != dict:
raise DataValidationException("record is not a dict", str(value))
Expand All @@ -212,5 +277,3 @@ def to_flat(self):
@classmethod
def from_json(cls, j):
    """Build an instance of ``cls`` from a JSON-style mapping.

    Every value of ``j`` is passed through ``__encode`` and the results are
    collected into a ``{key: encoded value}`` dict that is handed to the
    class constructor.  The earlier version used a set comprehension of
    ``(key, value)`` tuples; a set has no ``iteritems``, which is what the
    methods in this file call on ``self.definition`` (presumably where the
    constructor's first argument ends up -- confirm against ``__init__``).

    :param j: mapping that provides ``iteritems()``.
    :returns: a new ``cls`` instance.
    """
    # NOTE(review): inside a class body the name ``__encode`` is subject to
    # private name mangling, so a module-level helper with that name would
    # not be found from here.  Its definition is not visible; confirm.
    return cls({k: __encode(v) for k, v in sorted(j.iteritems())})


41 changes: 30 additions & 11 deletions zschema/leaves.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@ def __init__(self,
es_index=None,
es_analyzer=None,
doc=None,
examples=None,
es_include_raw=None,
deprecated=False,
ignore=False,
autocomplete_include=True,
autocomplete_category=None,
autocomplete_icon=None,
category=None,
exclude=None,
metadata=None,
units=None,
Expand All @@ -30,6 +29,7 @@ def __init__(self,
self.es_index = es_index
self.es_analyzer = es_analyzer
self.doc = doc
self.examples = examples if examples else []
if es_include_raw is not None:
self.es_include_raw = es_include_raw
else:
Expand All @@ -40,9 +40,7 @@ def __init__(self,
e = "WARN: %s is deprecated and will be removed in a "\
"future release\n" % self.__class__.__name__
sys.stderr.write(e)
self.autocomplete_category = autocomplete_category
self.autocomplete_category = autocomplete_category
self.autocomplete_icon = autocomplete_icon
self.category = category
self._exclude = set(exclude) if exclude else set([])
self.metadata = metadata if metadata else {}
self.units = units
Expand All @@ -56,7 +54,8 @@ def to_dict(self):
"type":self.__class__.__name__,
"es_type":self.ES_TYPE,
"bq_type":self.BQ_TYPE,
"metadata":self.metadata
"metadata":self.metadata,
"examples": self.examples,
}
if self.units is not None:
retv["units"] = self.units
Expand All @@ -72,13 +71,36 @@ def to_es(self):
self.add_es_var(retv, "analyzer", "es_analyzer", "ES_ANALYZER")
self.add_es_var(retv, "search_analyzer", "es_search_analyzer",
"ES_SEARCH_ANALYZER")

if self.es_include_raw:
retv["fields"] = {
"raw":{"type":"keyword"}
}
return retv

def _docs_common(self, parent_category):
    """Return the documentation fields shared by the ES and BQ outputs.

    :param parent_category: category to fall back to when this field has no
        category of its own.
    :returns: dict with ``"detail_type"`` (the name of this object's class),
        ``"category"``, ``"doc"`` and ``"required"``, plus either
        ``"values"`` (when the object has a non-empty ``values_s``) or
        ``"examples"``.
    """
    described = {
        "detail_type": self.__class__.__name__,
        "category": self.category or parent_category,
        "doc": self.doc,
        "required": self.required,
    }
    has_values = hasattr(self, "values_s") and len(self.values_s) > 0
    if not has_values:
        described["examples"] = self.examples
        return described
    # A non-empty value set is listed instead of the examples.
    described["values"] = list(self.values_s)
    return described

def docs_es(self, parent_category=None):
    """Describe this field for the Elasticsearch documentation output.

    :param parent_category: category to fall back to when this field has no
        category of its own.
    :returns: the dict from ``self._docs_common`` after ``self.add_es_var``
        has been applied to it for the ``"analyzer"`` key, with ``"type"``
        set to ``self.ES_TYPE``.
    """
    entry = self._docs_common(parent_category)
    # add_es_var is called before "type" is set, matching the key order the
    # plain assignment sequence produces.
    self.add_es_var(entry, "analyzer", "es_analyzer", "ES_ANALYZER")
    entry["type"] = self.ES_TYPE
    return entry

def docs_bq(self, parent_category=None):
    """Describe this field for the BigQuery documentation output.

    :param parent_category: category to fall back to when this field has no
        category of its own.
    :returns: the dict from ``self._docs_common`` with ``"type"`` set to
        ``self.BQ_TYPE``.
    """
    entry = self._docs_common(parent_category)
    entry["type"] = self.BQ_TYPE
    return entry

def to_bigquery(self, name):
if not self._check_valid_name(name):
raise Exception("Invalid field name: %s" % name)
Expand Down Expand Up @@ -118,9 +140,6 @@ def to_flat(self, parent, name, repeated=False):
"mode":mode
}

def to_autocomplete(self, parent, name, repated=False):
    """Placeholder with no implementation; does nothing and returns None.

    NOTE(review): ``repated`` looks like a misspelling of ``repeated``; the
    name is kept as is because it is part of the signature.
    """

def print_indent_string(self, name, indent):
val = self.key_to_string(name)
if indent:
Expand Down
Loading

0 comments on commit 0e3d8fd

Please sign in to comment.