From dcb50852a1320783f824309fe6c1003f0cf8b64c Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Thu, 5 Apr 2018 16:15:01 -0400 Subject: [PATCH 01/17] Remove html output formats --- zschema/__main__.py | 17 ++--------------- zschema/compounds.py | 11 ----------- 2 files changed, 2 insertions(+), 26 deletions(-) diff --git a/zschema/__main__.py b/zschema/__main__.py index 321a35f..b91d654 100644 --- a/zschema/__main__.py +++ b/zschema/__main__.py @@ -11,8 +11,8 @@ def usage(): sys.stderr.write("USAGE: %s command schema [file].\n" % sys.argv[0].split("/")[-1]) - sys.stderr.write("Valid commands: bigquery, elasticsearch, json, text, html, censys-html, flat, validate.\n") - sys.stderr.write("schema should be defined as file.py:record\n") + sys.stderr.write("Valid commands: bigquery, elasticsearch, json, text, flat, validate.\n") + sys.stderr.write("Schema should be passed as file.py:record\n") sys.stderr.write("VERSION: %s\n" % zschema.__version__) sys.exit(1) @@ -29,24 +29,11 @@ def main(): print json.dumps(record.to_es(recname)) elif command == "json": print record.to_json() - elif command == "html": - for r in record.to_flat(): - type_ = r.get("es_type", "") - print "%s%s" % (r["name"], type_) elif command == "text": print record.to_text() elif command == "flat": for r in record.to_flat(): print json.dumps(r) - elif command == "censys-html": - for r in record.to_flat(): - type_ = r.get("es_type", None) - len_ = r["name"].count(".") - style = 'style="padding-left: %ipx"' % (15 * len_ + 5) - if not type_: - print '%s%s' % (style, r["name"], "") - else: - print "%s%s" % (style, r["name"], type_) elif command == "validate": if not os.path.exists(sys.argv[3]): sys.stderr.write("Invalid test file. %s does not exist.\n" % sys.argv[3]) diff --git a/zschema/compounds.py b/zschema/compounds.py index d107b60..0632634 100644 --- a/zschema/compounds.py +++ b/zschema/compounds.py @@ -175,19 +175,10 @@ def to_bigquery(self): return [s.to_bigquery(name) for (name, s) in source \ if not s.exclude_bigquery] - def to_html(self): - pass - - def to_documented_html(self): - pass - def print_indent_string(self): for name, field in sorted(self.definition.iteritems()): field.print_indent_string(name, 0) - def to_dotted_text(self): - pass - def validate(self, value): if type(value) != dict: raise DataValidationException("record is not a dict", str(value)) @@ -212,5 +203,3 @@ def to_flat(self): @classmethod def from_json(cls, j): return cls({(k, __encode(v)) for k, v in sorted(j.iteritems())}) - - From 90a5397cd3f79118780ddb6ba10b14fe52b4582a Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Thu, 5 Apr 2018 16:15:14 -0400 Subject: [PATCH 02/17] Improve usage docs --- zschema/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zschema/__main__.py b/zschema/__main__.py index b91d654..d2ac3f5 100644 --- a/zschema/__main__.py +++ b/zschema/__main__.py @@ -13,6 +13,7 @@ def usage(): sys.stderr.write("USAGE: %s command schema [file].\n" % sys.argv[0].split("/")[-1]) sys.stderr.write("Valid commands: bigquery, elasticsearch, json, text, flat, validate.\n") sys.stderr.write("Schema should be passed as file.py:record\n") + sys.stderr.write("The optional 'file' argument is used only as the test file for the 'validate' command.\n") sys.stderr.write("VERSION: %s\n" % zschema.__version__) sys.exit(1) From b14cbd8d8e328c2b8140b6646e21c59fba1051b1 Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Thu, 5 Apr 2018 16:57:44 -0400 Subject: [PATCH 03/17] Add an annotated elasticsearch output, which includes docs for (sub)records & leaves --- zschema/__main__.py | 4 +++- zschema/compounds.py | 18 +++++++++++------- zschema/leaves.py | 4 +++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/zschema/__main__.py b/zschema/__main__.py index d2ac3f5..7215a9d 100644 --- a/zschema/__main__.py +++ b/zschema/__main__.py @@ -11,7 +11,7 @@ def usage(): sys.stderr.write("USAGE: %s command schema [file].\n" % sys.argv[0].split("/")[-1]) - sys.stderr.write("Valid commands: bigquery, elasticsearch, json, text, flat, validate.\n") + sys.stderr.write("Valid commands: bigquery, elasticsearch, es-annotated, json, text, flat, validate.\n") sys.stderr.write("Schema should be passed as file.py:record\n") sys.stderr.write("The optional 'file' argument is used only as the test file for the 'validate' command.\n") sys.stderr.write("VERSION: %s\n" % zschema.__version__) @@ -28,6 +28,8 @@ def main(): print json.dumps(record.to_bigquery()) elif command == "elasticsearch": print json.dumps(record.to_es(recname)) + elif command == "es-annotated": + print json.dumps(record.to_es(recname, annotated=True)) elif command == "json": print record.to_json() elif command == "text": diff --git a/zschema/compounds.py b/zschema/compounds.py index 0632634..04ed984 100644 --- a/zschema/compounds.py +++ b/zschema/compounds.py @@ -33,8 +33,8 @@ def to_bigquery(self, name): retv["mode"] = "REPEATED" return retv - def to_es(self): - return self.object_.to_es() + def to_es(self, annotated=False): + return self.object_.to_es(annotated=annotated) def validate(self, name, value): if type(value) != list: @@ -128,10 +128,14 @@ def print_indent_string(self, name, indent): for name, value in sorted(self.definition.iteritems()): value.print_indent_string(name, indent+1) - def to_es(self): - p = {self.key_to_es(k): v.to_es() for k, v in sorted(self.definition.iteritems()) \ + def to_es(self, annotated=False): + p = {self.key_to_es(k): v.to_es(annotated=annotated) \ + for k, v in sorted(self.definition.iteritems()) \ if not v.exclude_elasticsearch} - return {"properties": p} + retv = {"properties": p} + if annotated and self.doc: + retv["doc"] = self.doc + return retv def to_dict(self): source = sorted(self.definition.iteritems()) @@ -167,8 +171,8 @@ def to_bigquery(self, name): class Record(SubRecord): - def to_es(self, name): - return {name:SubRecord.to_es(self)} + def to_es(self, name, annotated=False): + return {name:SubRecord.to_es(self, annotated=annotated)} def to_bigquery(self): source = sorted(self.definition.iteritems()) diff --git a/zschema/leaves.py b/zschema/leaves.py index f5829ce..77fd05f 100644 --- a/zschema/leaves.py +++ b/zschema/leaves.py @@ -66,7 +66,7 @@ def to_dict(self): "ES_SEARCH_ANALYZER") return retv - def to_es(self): + def to_es(self, annotated=False): retv = {"type":self.ES_TYPE} self.add_es_var(retv, "index", "es_index", "ES_INDEX") self.add_es_var(retv, "analyzer", "es_analyzer", "ES_ANALYZER") @@ -77,6 +77,8 @@ def to_es(self): retv["fields"] = { "raw":{"type":"keyword"} } + if annotated and self.doc: + retv["doc"] = self.doc return retv def to_bigquery(self, name): From 9d1ec4e8244f722525f82a16aaced8d9f7bbdd9b Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Thu, 5 Apr 2018 17:00:48 -0400 Subject: [PATCH 04/17] =?UTF-8?q?Remove=20unimplemented=20=E2=80=98text?= =?UTF-8?q?=E2=80=99=20command?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zschema/__main__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/zschema/__main__.py b/zschema/__main__.py index 7215a9d..1dd036d 100644 --- a/zschema/__main__.py +++ b/zschema/__main__.py @@ -11,7 +11,7 @@ def usage(): sys.stderr.write("USAGE: %s command schema [file].\n" % sys.argv[0].split("/")[-1]) - sys.stderr.write("Valid commands: bigquery, elasticsearch, es-annotated, json, text, flat, validate.\n") + sys.stderr.write("Valid commands: bigquery, elasticsearch, es-annotated, json, flat, validate.\n") sys.stderr.write("Schema should be passed as file.py:record\n") sys.stderr.write("The optional 'file' argument is used only as the test file for the 'validate' command.\n") sys.stderr.write("VERSION: %s\n" % zschema.__version__) @@ -32,8 +32,6 @@ def main(): print json.dumps(record.to_es(recname, annotated=True)) elif command == "json": print record.to_json() - elif command == "text": - print record.to_text() elif command == "flat": for r in record.to_flat(): print json.dumps(r) From dece7229ba017be2eecd76037a2062b87d777e1b Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Thu, 5 Apr 2018 17:06:31 -0400 Subject: [PATCH 05/17] Include a more detailed type for annotated ES output --- zschema/leaves.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/zschema/leaves.py b/zschema/leaves.py index 77fd05f..da0744a 100644 --- a/zschema/leaves.py +++ b/zschema/leaves.py @@ -77,8 +77,10 @@ def to_es(self, annotated=False): retv["fields"] = { "raw":{"type":"keyword"} } - if annotated and self.doc: - retv["doc"] = self.doc + if annotated: + if self.doc: + retv["doc"] = self.doc + retv["detail_type"] = self.__class__.__name__ return retv def to_bigquery(self, name): From 7d36532723774bd18bfc3ebd20dc17819e18d3ea Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Thu, 5 Apr 2018 17:32:24 -0400 Subject: [PATCH 06/17] Add annotated bigquery output --- zschema/__main__.py | 4 +++- zschema/compounds.py | 29 ++++++++++++++++++----------- zschema/leaves.py | 4 +++- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/zschema/__main__.py b/zschema/__main__.py index 1dd036d..54c9c9f 100644 --- a/zschema/__main__.py +++ b/zschema/__main__.py @@ -11,7 +11,7 @@ def usage(): sys.stderr.write("USAGE: %s command schema [file].\n" % sys.argv[0].split("/")[-1]) - sys.stderr.write("Valid commands: bigquery, elasticsearch, es-annotated, json, flat, validate.\n") + sys.stderr.write("Valid commands: bigquery, elasticsearch, es-annotated, bq-annotated, json, flat, validate.\n") sys.stderr.write("Schema should be passed as file.py:record\n") sys.stderr.write("The optional 'file' argument is used only as the test file for the 'validate' command.\n") sys.stderr.write("VERSION: %s\n" % zschema.__version__) @@ -30,6 +30,8 @@ def main(): print json.dumps(record.to_es(recname)) elif command == "es-annotated": print json.dumps(record.to_es(recname, annotated=True)) + elif command == "bq-annotated": + print json.dumps(record.to_bigquery(annotated=True)) elif command == "json": print record.to_json() elif command == "flat": diff --git a/zschema/compounds.py b/zschema/compounds.py index 04ed984..a4a0c1b 100644 --- a/zschema/compounds.py +++ b/zschema/compounds.py @@ -28,8 +28,8 @@ def print_indent_string(self, name, indent): print tabs + name + ":%s:" % self.__class__.__name__, self.object_.print_indent_string(self.key_to_string(name), indent+1) - def to_bigquery(self, name): - retv = self.object_.to_bigquery(name) + def to_bigquery(self, name, annotated=False): + retv = self.object_.to_bigquery(name, annotated=annotated) retv["mode"] = "REPEATED" return retv @@ -112,15 +112,20 @@ def merge(self, other): self.definition = newdef return self - def to_bigquery(self, name): - fields = [v.to_bigquery(k) for (k,v) in sorted(self.definition.iteritems()) if \ - not v.exclude_bigquery] - return { + def to_bigquery(self, name, annotated=False): + fields = [v.to_bigquery(k, annotated=annotated) \ + for (k,v) in sorted(self.definition.iteritems()) \ + if not v.exclude_bigquery + ] + retv = { "name":self.key_to_bq(name), "type":"RECORD", "fields":fields, "mode":"REQUIRED" if self.required else "NULLABLE" } + if annotated and self.doc: + retv["doc"] = self.doc + return retv def print_indent_string(self, name, indent): tabs = "\t" * indent if indent else "" @@ -160,11 +165,11 @@ def __init__(self, object_, subrecord_name, max_items=10): ListOf.__init__(self, object_, max_items) self.subrecord_name = subrecord_name - def to_bigquery(self, name): + def to_bigquery(self, name, annotated=False): subr = SubRecord({ self.subrecord_name:ListOf(self.object_) }) - retv = subr.to_bigquery(self.key_to_bq(name)) + retv = subr.to_bigquery(self.key_to_bq(name), annotated=annotated) retv["mode"] = "REPEATED" return retv @@ -174,10 +179,12 @@ class Record(SubRecord): def to_es(self, name, annotated=False): return {name:SubRecord.to_es(self, annotated=annotated)} - def to_bigquery(self): + def to_bigquery(self, annotated=False): source = sorted(self.definition.iteritems()) - return [s.to_bigquery(name) for (name, s) in source \ - if not s.exclude_bigquery] + return [s.to_bigquery(name, annotated=annotated) \ + for (name, s) in source \ + if not s.exclude_bigquery + ] def print_indent_string(self): for name, field in sorted(self.definition.iteritems()): diff --git a/zschema/leaves.py b/zschema/leaves.py index da0744a..8c74c84 100644 --- a/zschema/leaves.py +++ b/zschema/leaves.py @@ -83,13 +83,15 @@ def to_es(self, annotated=False): retv["detail_type"] = self.__class__.__name__ return retv - def to_bigquery(self, name): + def to_bigquery(self, name, annotated=False): if not self._check_valid_name(name): raise Exception("Invalid field name: %s" % name) mode = "REQUIRED" if self.required else "NULLABLE" retv = {"name":self.key_to_bq(name), "type":self.BQ_TYPE, "mode":mode} if self.doc: retv["doc"] = self.doc + if annotated: + retv["detail_type"] = self.__class__.__name__ return retv def to_string(self, name): From e5d02637be242372784a71897e51083473302048 Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Thu, 5 Apr 2018 17:33:05 -0400 Subject: [PATCH 07/17] Include possible enum values in annotated docs --- zschema/leaves.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/zschema/leaves.py b/zschema/leaves.py index 8c74c84..a3c1c7f 100644 --- a/zschema/leaves.py +++ b/zschema/leaves.py @@ -81,6 +81,9 @@ def to_es(self, annotated=False): if self.doc: retv["doc"] = self.doc retv["detail_type"] = self.__class__.__name__ + if hasattr(self, "values_s") and len(self.values_s): + # gotta clean this up but for now... + retv["values"] = list(self.values_s) return retv def to_bigquery(self, name, annotated=False): @@ -92,6 +95,9 @@ def to_bigquery(self, name, annotated=False): retv["doc"] = self.doc if annotated: retv["detail_type"] = self.__class__.__name__ + if hasattr(self, "values_s") and len(self.values_s): + # gotta clean this up but for now... + retv["values"] = list(self.values_s) return retv def to_string(self, name): From b61f45ac4743a02565d0d86375aa36cdf16e9f5c Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Fri, 6 Apr 2018 10:46:48 -0400 Subject: [PATCH 08/17] Allow leaves to include a list of examples --- zschema/leaves.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/zschema/leaves.py b/zschema/leaves.py index a3c1c7f..dc709ec 100644 --- a/zschema/leaves.py +++ b/zschema/leaves.py @@ -15,6 +15,7 @@ def __init__(self, es_index=None, es_analyzer=None, doc=None, + examples=None, es_include_raw=None, deprecated=False, ignore=False, @@ -30,6 +31,7 @@ def __init__(self, self.es_index = es_index self.es_analyzer = es_analyzer self.doc = doc + self.examples = examples if examples else [] if es_include_raw is not None: self.es_include_raw = es_include_raw else: @@ -56,7 +58,8 @@ def to_dict(self): "type":self.__class__.__name__, "es_type":self.ES_TYPE, "bq_type":self.BQ_TYPE, - "metadata":self.metadata + "metadata":self.metadata, + "examples": self.examples, } if self.units is not None: retv["units"] = self.units @@ -84,6 +87,8 @@ def to_es(self, annotated=False): if hasattr(self, "values_s") and len(self.values_s): # gotta clean this up but for now... retv["values"] = list(self.values_s) + else: + retv["examples"] = self.examples return retv def to_bigquery(self, name, annotated=False): @@ -98,6 +103,8 @@ def to_bigquery(self, name, annotated=False): if hasattr(self, "values_s") and len(self.values_s): # gotta clean this up but for now... retv["values"] = list(self.values_s) + else: + retv["examples"] = self.examples return retv def to_string(self, name): From 909ab525f2ae0b1de367c84285ca9e0af61705f0 Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Fri, 6 Apr 2018 10:48:27 -0400 Subject: [PATCH 09/17] Remove unused Leaf.to_autocomplete method --- zschema/leaves.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/zschema/leaves.py b/zschema/leaves.py index dc709ec..8067ad3 100644 --- a/zschema/leaves.py +++ b/zschema/leaves.py @@ -137,9 +137,6 @@ def to_flat(self, parent, name, repeated=False): "mode":mode } - def to_autocomplete(self, parent, name, repated=False): - pass - def print_indent_string(self, name, indent): val = self.key_to_string(name) if indent: From 0ad3719ce7472bfc2e1a2193f2781f73f48118b3 Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Fri, 6 Apr 2018 12:08:16 -0400 Subject: [PATCH 10/17] =?UTF-8?q?Add=20a=20cascading=20=E2=80=9Ccategory?= =?UTF-8?q?=E2=80=9D=20property=20for=20every=20field=20in=20annotated=20o?= =?UTF-8?q?utput?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zschema/compounds.py | 49 +++++++++++++++++++++++++++++--------------- zschema/leaves.py | 26 +++++++++++++++-------- 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/zschema/compounds.py b/zschema/compounds.py index a4a0c1b..7caae2c 100644 --- a/zschema/compounds.py +++ b/zschema/compounds.py @@ -10,9 +10,10 @@ def _is_valid_object(name, object_): class ListOf(Keyable): - def __init__(self, object_, max_items=10): + def __init__(self, object_, max_items=10, category=None): self.object_ = object_ self.max_items = max_items + self.category = category _is_valid_object("Anonymous ListOf", object_) @property @@ -28,13 +29,20 @@ def print_indent_string(self, name, indent): print tabs + name + ":%s:" % self.__class__.__name__, self.object_.print_indent_string(self.key_to_string(name), indent+1) - def to_bigquery(self, name, annotated=False): + def to_bigquery(self, name, annotated=False, parent_category=None): retv = self.object_.to_bigquery(name, annotated=annotated) retv["mode"] = "REPEATED" + if annotated: + category = self.category if self.category else parent_category + retv["category"] = category return retv - def to_es(self, annotated=False): - return self.object_.to_es(annotated=annotated) + def to_es(self, annotated=False, parent_category=None): + retv = self.object_.to_es(annotated=annotated) + if annotated: + category = self.category if self.category else parent_category + retv["category"] = category + return retv def validate(self, name, value): if type(value) != list: @@ -59,11 +67,13 @@ def __init__(self, doc=None, extends=None, allow_unknown=False, - exclude=None): + exclude=None, + category=None): self.definition = definition self.required = required self.allow_unknown = allow_unknown self.doc = doc + self.category = category self._exclude = set(exclude) if exclude else set([]) # merge if extends: @@ -112,8 +122,9 @@ def merge(self, other): self.definition = newdef return self - def to_bigquery(self, name, annotated=False): - fields = [v.to_bigquery(k, annotated=annotated) \ + def to_bigquery(self, name, annotated=False, parent_category=None): + category = self.category if self.category else parent_category + fields = [v.to_bigquery(k, annotated=annotated, parent_category=category) \ for (k,v) in sorted(self.definition.iteritems()) \ if not v.exclude_bigquery ] @@ -133,8 +144,9 @@ def print_indent_string(self, name, indent): for name, value in sorted(self.definition.iteritems()): value.print_indent_string(name, indent+1) - def to_es(self, annotated=False): - p = {self.key_to_es(k): v.to_es(annotated=annotated) \ + def to_es(self, annotated=False, parent_category=None): + category = self.category if self.category else parent_category + p = {self.key_to_es(k): v.to_es(annotated=annotated, parent_category=category) \ for k, v in sorted(self.definition.iteritems()) \ if not v.exclude_elasticsearch} retv = {"properties": p} @@ -161,27 +173,32 @@ def validate(self, name, value): class NestedListOf(ListOf): - def __init__(self, object_, subrecord_name, max_items=10): - ListOf.__init__(self, object_, max_items) + def __init__(self, object_, subrecord_name, max_items=10, category=None): + ListOf.__init__(self, object_, max_items, category=category) self.subrecord_name = subrecord_name - def to_bigquery(self, name, annotated=False): + def to_bigquery(self, name, annotated=False, parent_category=None): subr = SubRecord({ self.subrecord_name:ListOf(self.object_) }) retv = subr.to_bigquery(self.key_to_bq(name), annotated=annotated) retv["mode"] = "REPEATED" + if annotated: + category = self.category if self.category else parent_category + retv["category"] = category return retv class Record(SubRecord): - def to_es(self, name, annotated=False): - return {name:SubRecord.to_es(self, annotated=annotated)} + def to_es(self, name, annotated=False, parent_category=None): + category = self.category if self.category else parent_category + return {name:SubRecord.to_es(self, annotated=annotated, parent_category=category)} - def to_bigquery(self, annotated=False): + def to_bigquery(self, annotated=False, parent_category=None): + category = self.category if self.category else parent_category source = sorted(self.definition.iteritems()) - return [s.to_bigquery(name, annotated=annotated) \ + return [s.to_bigquery(name, annotated=annotated, parent_category=category) \ for (name, s) in source \ if not s.exclude_bigquery ] diff --git a/zschema/leaves.py b/zschema/leaves.py index 8067ad3..e811470 100644 --- a/zschema/leaves.py +++ b/zschema/leaves.py @@ -19,9 +19,7 @@ def __init__(self, es_include_raw=None, deprecated=False, ignore=False, - autocomplete_include=True, - autocomplete_category=None, - autocomplete_icon=None, + category=None, exclude=None, metadata=None, units=None, @@ -42,9 +40,7 @@ def __init__(self, e = "WARN: %s is deprecated and will be removed in a "\ "future release\n" % self.__class__.__name__ sys.stderr.write(e) - self.autocomplete_category = autocomplete_category - self.autocomplete_category = autocomplete_category - self.autocomplete_icon = autocomplete_icon + self.category = category self._exclude = set(exclude) if exclude else set([]) self.metadata = metadata if metadata else {} self.units = units @@ -69,7 +65,7 @@ def to_dict(self): "ES_SEARCH_ANALYZER") return retv - def to_es(self, annotated=False): + def to_es(self, annotated=False, parent_category=None): retv = {"type":self.ES_TYPE} self.add_es_var(retv, "index", "es_index", "ES_INDEX") self.add_es_var(retv, "analyzer", "es_analyzer", "ES_ANALYZER") @@ -81,9 +77,15 @@ def to_es(self, annotated=False): "raw":{"type":"keyword"} } if annotated: + retv["detail_type"] = self.__class__.__name__ + category = self.category if self.category else parent_category + retv["category"] = category if self.doc: retv["doc"] = self.doc - retv["detail_type"] = self.__class__.__name__ + if self.min_value: + retv["min_value"] = self.min_value + if self.max_value: + retv["max_value"] = self.max_value if hasattr(self, "values_s") and len(self.values_s): # gotta clean this up but for now... retv["values"] = list(self.values_s) @@ -91,7 +93,7 @@ def to_es(self, annotated=False): retv["examples"] = self.examples return retv - def to_bigquery(self, name, annotated=False): + def to_bigquery(self, name, annotated=False, parent_category=None): if not self._check_valid_name(name): raise Exception("Invalid field name: %s" % name) mode = "REQUIRED" if self.required else "NULLABLE" @@ -100,6 +102,12 @@ def to_bigquery(self, name, annotated=False): retv["doc"] = self.doc if annotated: retv["detail_type"] = self.__class__.__name__ + category = self.category if self.category else parent_category + retv["category"] = category + if self.min_value: + retv["min_value"] = self.min_value + if self.max_value: + retv["max_value"] = self.max_value if hasattr(self, "values_s") and len(self.values_s): # gotta clean this up but for now... retv["values"] = list(self.values_s) From 6f60bfdabf4e2bfc3cd81f62b65bdf9a7f13d774 Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Mon, 9 Apr 2018 12:16:40 -0400 Subject: [PATCH 11/17] Implement new docs-es schema output --- zschema/__main__.py | 6 +++--- zschema/compounds.py | 45 ++++++++++++++++++++++++++++++-------------- zschema/leaves.py | 37 +++++++++++++++++++----------------- 3 files changed, 54 insertions(+), 34 deletions(-) diff --git a/zschema/__main__.py b/zschema/__main__.py index 54c9c9f..cc24d08 100644 --- a/zschema/__main__.py +++ b/zschema/__main__.py @@ -11,7 +11,7 @@ def usage(): sys.stderr.write("USAGE: %s command schema [file].\n" % sys.argv[0].split("/")[-1]) - sys.stderr.write("Valid commands: bigquery, elasticsearch, es-annotated, bq-annotated, json, flat, validate.\n") + sys.stderr.write("Valid commands: bigquery, elasticsearch, docs-es, docs-bq, json, flat, validate.\n") sys.stderr.write("Schema should be passed as file.py:record\n") sys.stderr.write("The optional 'file' argument is used only as the test file for the 'validate' command.\n") sys.stderr.write("VERSION: %s\n" % zschema.__version__) @@ -28,10 +28,10 @@ def main(): print json.dumps(record.to_bigquery()) elif command == "elasticsearch": print json.dumps(record.to_es(recname)) - elif command == "es-annotated": - print json.dumps(record.to_es(recname, annotated=True)) elif command == "bq-annotated": print json.dumps(record.to_bigquery(annotated=True)) + elif command == "docs-es": + print json.dumps(record.docs_es(recname)) elif command == "json": print record.to_json() elif command == "flat": diff --git a/zschema/compounds.py b/zschema/compounds.py index 7caae2c..574beed 100644 --- a/zschema/compounds.py +++ b/zschema/compounds.py @@ -37,11 +37,13 @@ def to_bigquery(self, name, annotated=False, parent_category=None): retv["category"] = category return retv - def to_es(self, annotated=False, parent_category=None): - retv = self.object_.to_es(annotated=annotated) - if annotated: - category = self.category if self.category else parent_category - retv["category"] = category + def to_es(self): + return self.object_.to_es() + + def docs_es(self, parent_category=None): + retv = self.object_.docs_es() + category = self.category if self.category else parent_category + retv["category"] = category return retv def validate(self, name, value): @@ -144,14 +146,27 @@ def print_indent_string(self, name, indent): for name, value in sorted(self.definition.iteritems()): value.print_indent_string(name, indent+1) - def to_es(self, annotated=False, parent_category=None): - category = self.category if self.category else parent_category - p = {self.key_to_es(k): v.to_es(annotated=annotated, parent_category=category) \ + def to_es(self): + p = {self.key_to_es(k): v.to_es() \ for k, v in sorted(self.definition.iteritems()) \ if not v.exclude_elasticsearch} - retv = {"properties": p} - if annotated and self.doc: - retv["doc"] = self.doc + return {"properties": p} + + def _docs_common(self, parent_category): + category = self.category if self.category else parent_category + retv = { + "category": category, + "doc": self.doc, + "type": self.__class__.__name__, + "required": self.required, + } + return retv + + def docs_es(self, parent_category=None): + retv = self._docs_common(parent_category=parent_category) + retv["fields"] = { self.key_to_es(k): v.docs_es() \ + for k, v in sorted(self.definition.iteritems()) \ + if not v.exclude_elasticsearch } return retv def to_dict(self): @@ -191,9 +206,11 @@ def to_bigquery(self, name, annotated=False, parent_category=None): class Record(SubRecord): - def to_es(self, name, annotated=False, parent_category=None): - category = self.category if self.category else parent_category - return {name:SubRecord.to_es(self, annotated=annotated, parent_category=category)} + def to_es(self, name): + return {name:SubRecord.to_es(self)} + + def docs_es(self, name, parent_category=None): + return {name: SubRecord.docs_es(self, parent_category=parent_category)} def to_bigquery(self, annotated=False, parent_category=None): category = self.category if self.category else parent_category diff --git a/zschema/leaves.py b/zschema/leaves.py index e811470..df9716d 100644 --- a/zschema/leaves.py +++ b/zschema/leaves.py @@ -65,32 +65,35 @@ def to_dict(self): "ES_SEARCH_ANALYZER") return retv - def to_es(self, annotated=False, parent_category=None): + def to_es(self): retv = {"type":self.ES_TYPE} self.add_es_var(retv, "index", "es_index", "ES_INDEX") self.add_es_var(retv, "analyzer", "es_analyzer", "ES_ANALYZER") self.add_es_var(retv, "search_analyzer", "es_search_analyzer", "ES_SEARCH_ANALYZER") - if self.es_include_raw: retv["fields"] = { "raw":{"type":"keyword"} } - if annotated: - retv["detail_type"] = self.__class__.__name__ - category = self.category if self.category else parent_category - retv["category"] = category - if self.doc: - retv["doc"] = self.doc - if self.min_value: - retv["min_value"] = self.min_value - if self.max_value: - retv["max_value"] = self.max_value - if hasattr(self, "values_s") and len(self.values_s): - # gotta clean this up but for now... - retv["values"] = list(self.values_s) - else: - retv["examples"] = self.examples + return retv + + def _docs_common(self, parent_category): + retv = { + "detail_type": self.__class__.__name__, + "category": self.category if self.category else parent_category, + "doc": self.doc, + "required": self.required, + } + if hasattr(self, "values_s") and len(self.values_s): + retv["values"] = list(self.values_s) + else: + retv["examples"] = self.examples + return retv + + def docs_es(self, parent_category=None): + retv = self._docs_common(parent_category) + self.add_es_var(retv, "analyzer", "es_analyzer", "ES_ANALYZER") + retv["type"] = self.ES_TYPE return retv def to_bigquery(self, name, annotated=False, parent_category=None): From 83809cd8c419f8060527fa7634a3ba816cab03ba Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Mon, 9 Apr 2018 14:22:55 -0400 Subject: [PATCH 12/17] Implement new docs-bq output --- zschema/__main__.py | 4 ++-- zschema/compounds.py | 57 +++++++++++++++++++++++++++++--------------- zschema/leaves.py | 20 +++++----------- 3 files changed, 46 insertions(+), 35 deletions(-) diff --git a/zschema/__main__.py b/zschema/__main__.py index cc24d08..558622b 100644 --- a/zschema/__main__.py +++ b/zschema/__main__.py @@ -28,10 +28,10 @@ def main(): print json.dumps(record.to_bigquery()) elif command == "elasticsearch": print json.dumps(record.to_es(recname)) - elif command == "bq-annotated": - print json.dumps(record.to_bigquery(annotated=True)) elif command == "docs-es": print json.dumps(record.docs_es(recname)) + elif command == "docs-bq": + print json.dumps(record.docs_bq(recname)) elif command == "json": print record.to_json() elif command == "flat": diff --git a/zschema/compounds.py b/zschema/compounds.py index 574beed..01e3ab4 100644 --- a/zschema/compounds.py +++ b/zschema/compounds.py @@ -29,12 +29,16 @@ def print_indent_string(self, name, indent): print tabs + name + ":%s:" % self.__class__.__name__, self.object_.print_indent_string(self.key_to_string(name), indent+1) - def to_bigquery(self, name, annotated=False, parent_category=None): - retv = self.object_.to_bigquery(name, annotated=annotated) + def to_bigquery(self, name): + retv = self.object_.to_bigquery(name) retv["mode"] = "REPEATED" - if annotated: - category = self.category if self.category else parent_category - retv["category"] = category + return retv + + def docs_bq(self, parent_category=None): + retv = self.object_.docs_bq() + category = self.category if self.category else parent_category + retv["category"] = category + retv["repeated"] = True return retv def to_es(self): @@ -124,9 +128,8 @@ def merge(self, other): self.definition = newdef return self - def to_bigquery(self, name, annotated=False, parent_category=None): - category = self.category if self.category else parent_category - fields = [v.to_bigquery(k, annotated=annotated, parent_category=category) \ + def to_bigquery(self, name): + fields = [v.to_bigquery(k) \ for (k,v) in sorted(self.definition.iteritems()) \ if not v.exclude_bigquery ] @@ -136,8 +139,14 @@ def to_bigquery(self, name, annotated=False, parent_category=None): "fields":fields, "mode":"REQUIRED" if self.required else "NULLABLE" } - if annotated and self.doc: - retv["doc"] = self.doc + return retv + + def docs_bq(self, parent_category=None): + retv = self._docs_common(parent_category=parent_category) + fields = { self.key_to_bq(k): v.docs_bq() \ + for (k,v) in sorted(self.definition.iteritems()) \ + if not v.exclude_bigquery } + retv["fields"] = fields return retv def print_indent_string(self, name, indent): @@ -192,15 +201,21 @@ def __init__(self, object_, subrecord_name, max_items=10, category=None): ListOf.__init__(self, object_, max_items, category=category) self.subrecord_name = subrecord_name - def to_bigquery(self, name, annotated=False, parent_category=None): + def to_bigquery(self, name): subr = SubRecord({ self.subrecord_name:ListOf(self.object_) }) - retv = subr.to_bigquery(self.key_to_bq(name), annotated=annotated) + retv = subr.to_bigquery(self.key_to_bq(name)) retv["mode"] = "REPEATED" - if annotated: - category = self.category if self.category else parent_category - retv["category"] = category + return retv + + def docs_bq(self, parent_category=None): + subr = SubRecord({ + self.subrecord_name: ListOf(self.object_) + }) + category = self.category if self.category else parent_category + retv = subr.docs_bq(parent_category=category) + retv["repeated"] = True return retv @@ -210,16 +225,20 @@ def to_es(self, name): return {name:SubRecord.to_es(self)} def docs_es(self, name, parent_category=None): - return {name: SubRecord.docs_es(self, parent_category=parent_category)} - - def to_bigquery(self, annotated=False, parent_category=None): category = self.category if self.category else parent_category + return {name: SubRecord.docs_es(self, parent_category=category)} + + def to_bigquery(self): source = sorted(self.definition.iteritems()) - return [s.to_bigquery(name, annotated=annotated, parent_category=category) \ + return [s.to_bigquery(name) \ for (name, s) in source \ if not s.exclude_bigquery ] + def docs_bq(self, name, parent_category=None): + category = self.category if self.category else parent_category + return {name: SubRecord.docs_bq(self, parent_category=category)} + def print_indent_string(self): for name, field in sorted(self.definition.iteritems()): field.print_indent_string(name, 0) diff --git a/zschema/leaves.py b/zschema/leaves.py index df9716d..c0c4dcb 100644 --- a/zschema/leaves.py +++ b/zschema/leaves.py @@ -96,26 +96,18 @@ def docs_es(self, parent_category=None): retv["type"] = self.ES_TYPE return retv - def to_bigquery(self, name, annotated=False, parent_category=None): + def docs_bq(self, parent_category=None): + retv = self._docs_common(parent_category) + retv["type"] = self.BQ_TYPE + return retv + + def to_bigquery(self, name): if not self._check_valid_name(name): raise Exception("Invalid field name: %s" % name) mode = "REQUIRED" if self.required else "NULLABLE" retv = {"name":self.key_to_bq(name), "type":self.BQ_TYPE, "mode":mode} if self.doc: retv["doc"] = self.doc - if annotated: - retv["detail_type"] = self.__class__.__name__ - category = self.category if self.category else parent_category - retv["category"] = category - if self.min_value: - retv["min_value"] = self.min_value - if self.max_value: - retv["max_value"] = self.max_value - if hasattr(self, "values_s") and len(self.values_s): - # gotta clean this up but for now... - retv["values"] = list(self.values_s) - else: - retv["examples"] = self.examples return retv def to_string(self, name): From 87952a277ca07c321a638a4c422a20ff5a4db2dd Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Tue, 10 Apr 2018 14:57:10 -0400 Subject: [PATCH 13/17] [minor] more concise Python h/t @andrewsardone --- zschema/compounds.py | 12 ++++++------ zschema/leaves.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/zschema/compounds.py b/zschema/compounds.py index 01e3ab4..4a7d502 100644 --- a/zschema/compounds.py +++ b/zschema/compounds.py @@ -36,7 +36,7 @@ def to_bigquery(self, name): def docs_bq(self, parent_category=None): retv = self.object_.docs_bq() - category = self.category if self.category else parent_category + category = self.category or parent_category retv["category"] = category retv["repeated"] = True return retv @@ -46,7 +46,7 @@ def to_es(self): def docs_es(self, parent_category=None): retv = self.object_.docs_es() - category = self.category if self.category else parent_category + category = self.category or parent_category retv["category"] = category return retv @@ -162,7 +162,7 @@ def to_es(self): return {"properties": p} def _docs_common(self, parent_category): - category = self.category if self.category else parent_category + category = self.category or parent_category retv = { "category": category, "doc": self.doc, @@ -213,7 +213,7 @@ def docs_bq(self, parent_category=None): subr = SubRecord({ self.subrecord_name: ListOf(self.object_) }) - category = self.category if self.category else parent_category + category = self.category or parent_category retv = subr.docs_bq(parent_category=category) retv["repeated"] = True return retv @@ -225,7 +225,7 @@ def to_es(self, name): return {name:SubRecord.to_es(self)} def docs_es(self, name, parent_category=None): - category = self.category if self.category else parent_category + category = self.category or parent_category return {name: SubRecord.docs_es(self, parent_category=category)} def to_bigquery(self): @@ -236,7 +236,7 @@ def to_bigquery(self): ] def docs_bq(self, name, parent_category=None): - category = self.category if self.category else parent_category + category = self.category or parent_category return {name: SubRecord.docs_bq(self, parent_category=category)} def print_indent_string(self): diff --git a/zschema/leaves.py b/zschema/leaves.py index c0c4dcb..b079630 100644 --- a/zschema/leaves.py +++ b/zschema/leaves.py @@ -80,7 +80,7 @@ def to_es(self): def _docs_common(self, parent_category): retv = { "detail_type": self.__class__.__name__, - "category": self.category if self.category else parent_category, + "category": self.category or parent_category, "doc": self.doc, "required": self.required, } From 69be48594d34e31b8b86d4b691c9c2a12bb63e86 Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Tue, 10 Apr 2018 15:35:03 -0400 Subject: [PATCH 14/17] Allow ListOf/NestedListOf to have `doc`s --- zschema/compounds.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/zschema/compounds.py b/zschema/compounds.py index 4a7d502..7a03d83 100644 --- a/zschema/compounds.py +++ b/zschema/compounds.py @@ -10,10 +10,11 @@ def _is_valid_object(name, object_): class ListOf(Keyable): - def __init__(self, object_, max_items=10, category=None): + def __init__(self, object_, max_items=10, doc=None, category=None): self.object_ = object_ self.max_items = max_items self.category = category + self.doc = doc _is_valid_object("Anonymous ListOf", object_) @property @@ -39,6 +40,8 @@ def docs_bq(self, parent_category=None): category = self.category or parent_category retv["category"] = category retv["repeated"] = True + if self.doc: + retv["doc"] = self.doc return retv def to_es(self): @@ -48,6 +51,8 @@ def docs_es(self, parent_category=None): retv = self.object_.docs_es() category = self.category or parent_category retv["category"] = category + if self.doc: + retv["doc"] = self.doc return retv def validate(self, name, value): @@ -197,8 +202,8 @@ def validate(self, name, value): class NestedListOf(ListOf): - def __init__(self, object_, subrecord_name, max_items=10, category=None): - ListOf.__init__(self, object_, max_items, category=category) + def __init__(self, object_, subrecord_name, max_items=10, doc=None, category=None): + ListOf.__init__(self, object_, max_items, doc=doc, category=category) self.subrecord_name = subrecord_name def to_bigquery(self, name): @@ -207,6 +212,8 @@ def to_bigquery(self, name): }) retv = subr.to_bigquery(self.key_to_bq(name)) retv["mode"] = "REPEATED" + if self.doc: + retv["doc"] = self.doc return retv def docs_bq(self, parent_category=None): @@ -216,6 +223,8 @@ def docs_bq(self, parent_category=None): category = self.category or parent_category retv = subr.docs_bq(parent_category=category) retv["repeated"] = True + if self.doc: + retv["doc"] = self.doc return retv From 1b0cfb7fc74340ec6965db50111e7dbaa2385283 Mon Sep 17 00:00:00 2001 From: Chris Dzombak Date: Tue, 10 Apr 2018 15:35:18 -0400 Subject: [PATCH 15/17] Indicate when an ES field is a list in doc output --- zschema/compounds.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zschema/compounds.py b/zschema/compounds.py index 7a03d83..cf3a245 100644 --- a/zschema/compounds.py +++ b/zschema/compounds.py @@ -51,6 +51,7 @@ def docs_es(self, parent_category=None): retv = self.object_.docs_es() category = self.category or parent_category retv["category"] = category + retv["repeated"] = True if self.doc: retv["doc"] = self.doc return retv From eaef7020327f2fb850da68ad495f4b65a51987f5 Mon Sep 17 00:00:00 2001 From: Andrew Sardone Date: Tue, 10 Apr 2018 16:10:06 -0400 Subject: [PATCH 16/17] Add docs to tests schema This is so we can assert on the format when testing our new "docs" output. This also required updating the BigQuery inline fixture. Co-authored-by: Chris Dzombak --- zschema/tests.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/zschema/tests.py b/zschema/tests.py index 85855d6..31769d4 100644 --- a/zschema/tests.py +++ b/zschema/tests.py @@ -115,6 +115,7 @@ def test_invalid(self): { "type": "INTEGER", "name": "ip", + "doc": "The IP Address of the host", "mode": "NULLABLE" }, ] @@ -154,16 +155,16 @@ def setUp(self): heartbleed = SubRecord({ "heartbeat_support":Boolean(), - "heartbleed_vulnerable":Boolean(), + "heartbleed_vulnerable":Boolean(category="Vulnerabilities"), "timestamp":DateTime() }) self.host = Record({ - "ipstr":IPv4Address(required=True), - "ip":Long(), + "ipstr":IPv4Address(required=True, examples=["8.8.8.8"]), + "ip":Long(doc="The IP Address of the host"), Port(443):SubRecord({ "tls":String(), "heartbleed":heartbleed - }), + }, category="heartbleed"), "tags":ListOf(String()) }) From 62a58a8ecf50c0208b5d4bde95a2f54b73aeb893 Mon Sep 17 00:00:00 2001 From: Andrew Sardone Date: Tue, 10 Apr 2018 16:13:48 -0400 Subject: [PATCH 17/17] =?UTF-8?q?Add=20minimal=20assertion=20tests=20aroun?= =?UTF-8?q?d=20new=20=E2=80=9Cdocs=E2=80=9D=20format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Dzombak --- zschema/tests.py | 181 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) diff --git a/zschema/tests.py b/zschema/tests.py index 31769d4..b222f7b 100644 --- a/zschema/tests.py +++ b/zschema/tests.py @@ -67,6 +67,178 @@ def test_invalid(self): } } +VALID_DOCS_OUTPUT_FOR_ES_FIELDS = { + "host": { + "category": None, + "doc": None, + "fields": { + "443": { + "category": "heartbleed", + "doc": None, + "fields": { + "heartbleed": { + "category": None, + "doc": None, + "fields": { + "heartbeat_support": { + "category": None, + "detail_type": "Boolean", + "doc": None, + "examples": [], + "required": False, + "type": "boolean" + }, + "heartbleed_vulnerable": { + "category": "Vulnerabilities", + "detail_type": "Boolean", + "doc": None, + "examples": [], + "required": False, + "type": "boolean" + }, + "timestamp": { + "category": None, + "detail_type": "DateTime", + "doc": None, + "examples": [], + "required": False, + "type": "date" + } + }, + "required": False, + "type": "SubRecord" + }, + "tls": { + "category": None, + "detail_type": "String", + "doc": None, + "examples": [], + "required": False, + "type": "keyword" + } + }, + "required": False, + "type": "SubRecord" + }, + "ip": { + "category": None, + "detail_type": "Long", + "doc": "The IP Address of the host", + "examples": [], + "required": False, + "type": "long" + }, + "ipstr": { + "category": None, + "detail_type": "IPv4Address", + "doc": None, + "examples": [ + "8.8.8.8" + ], + "required": True, + "type": "ip" + }, + "tags": { + "category": None, + "detail_type": "String", + "doc": None, + "examples": [], + "repeated": True, + "required": False, + "type": "keyword" + } + }, + "required": False, + "type": "Record" + } +} + +VALID_DOCS_OUTPUT_FOR_BIG_QUERY_FIELDS = { + "host": { + "category": None, + "doc": None, + "fields": { + "ip": { + "category": None, + "detail_type": "Long", + "doc": "The IP Address of the host", + "examples": [], + "required": False, + "type": "INTEGER" + }, + "ipstr": { + "category": None, + "detail_type": "IPv4Address", + "doc": None, + "examples": [ + "8.8.8.8" + ], + "required": True, + "type": "STRING" + }, + "p443": { + "category": "heartbleed", + "doc": None, + "fields": { + "heartbleed": { + "category": None, + "doc": None, + "fields": { + "heartbeat_support": { + "category": None, + "detail_type": "Boolean", + "doc": None, + "examples": [], + "required": False, + "type": "BOOLEAN" + }, + "heartbleed_vulnerable": { + "category": "Vulnerabilities", + "detail_type": "Boolean", + "doc": None, + "examples": [], + "required": False, + "type": "BOOLEAN" + }, + "timestamp": { + "category": None, + "detail_type": "DateTime", + "doc": None, + "examples": [], + "required": False, + "type": "DATETIME" + } + }, + "required": False, + "type": "SubRecord" + }, + "tls": { + "category": None, + "detail_type": "String", + "doc": None, + "examples": [], + "required": False, + "type": "STRING" + } + }, + "required": False, + "type": "SubRecord" + }, + "tags": { + "category": None, + "detail_type": "String", + "doc": None, + "examples": [], + "repeated": True, + "required": False, + "type": "STRING" + } + }, + "required": False, + "type": "Record" + } +} + VALID_BIG_QUERY = [ { "fields": [ @@ -178,6 +350,15 @@ def test_elasticsearch(self): r = self.host.to_es("host") self.assertEqual(r, VALID_ELASTIC_SEARCH) + def test_docs_output(self): + global VALID_DOCS_OUTPUT_FOR_ES_FIELDS + r = self.host.docs_es("host") + self.assertEqual(r, VALID_DOCS_OUTPUT_FOR_ES_FIELDS) + + global VALID_DOCS_OUTPUT_FOR_BIG_QUERY_FIELDS + r = self.host.docs_bq("host") + self.assertEqual(r, VALID_DOCS_OUTPUT_FOR_BIG_QUERY_FIELDS) + def test_validation_known_good(self): test = { "ipstr":"141.212.120.1",