From 4e6db8d7a93273ab3e6db1af6170e2e4019b65d3 Mon Sep 17 00:00:00 2001
From: vmonakhov
Date: Tue, 25 Jun 2024 12:42:10 +0300
Subject: [PATCH] Better parser data --
 https://github.com/ispras/lingvodoc-react/issues/1120 (#1508)

* init

* get paragraph id

* get dedoc data

* text from dedoc

* full results structure

* fixes

* fixes

* undo doc_parser.py

* handling several bold words

* fix

* next

* fixes

* better bold font and refactoring

* refactoring

* save to_json

* most correct version

* use json any way

* cleanup

* cleanup

* next changes

* some fixes

* json_to_html

* next steps

* fixes after testing

* fixes

* fixed for strange parsers
---
 lingvodoc/schema/gql_parserresult.py |   8 ++
 lingvodoc/schema/query.py            |  20 ++--
 lingvodoc/utils/creation.py          | 135 +++++++++++++++++++++++++--
 3 files changed, 150 insertions(+), 13 deletions(-)

diff --git a/lingvodoc/schema/gql_parserresult.py b/lingvodoc/schema/gql_parserresult.py
index 7ede31e6..94b3d04e 100644
--- a/lingvodoc/schema/gql_parserresult.py
+++ b/lingvodoc/schema/gql_parserresult.py
@@ -98,6 +98,7 @@
 from lingvodoc.utils.creation import (
     create_parser_result,
+    json_to_html,
     async_create_parser_result)
 
 import lingvodoc.utils.doc_parser as ParseMethods
@@ -108,6 +109,8 @@
     as_storage_file,
     storage_file)
 
+from pdb import set_trace as A
+
 
 # Setting up logging.
 
 log = logging.getLogger(__name__)
@@ -444,6 +447,7 @@ class Arguments:
         id = LingvodocID(required=True)
         element_id = graphene.String()
         content = graphene.String()
+        content_fmt = graphene.String()
         reexecute = graphene.Boolean()
         synchronous = graphene.Boolean()
@@ -455,6 +459,10 @@ def mutate(root, info, **args):
         parser_result_id = args.get('id')
         element_id = args.get('element_id')
         content_from_args = args.get('content')
+        content_fmt = args.get('content_fmt', 'html')
+        # incoming content may be in json format
+        if len(content_from_args or []) > 0 and content_fmt == 'json':
+            content_from_args = json_to_html(json.loads(content_from_args))
         reexecute = args.get('reexecute')
         synchronous = args.get('synchronous', False)
diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index 6c5a419d..b9e32148 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -313,6 +313,7 @@
 from lingvodoc.utils.creation import (
     create_entity,
+    get_result_json,
     edit_role)
 
 from lingvodoc.utils.elan_functions import tgt_to_eaf
@@ -596,7 +597,7 @@ class Query(graphene.ObjectType):
         client_id_list = graphene.List(graphene.Int, required = True)))
     parser_results = graphene.Field((graphene.List(ParserResult)), entity_id = LingvodocID(), parser_id=LingvodocID())
-    parser_result = graphene.Field(ParserResult, id=LingvodocID())
+    parser_result = graphene.Field(ParserResult, id=LingvodocID(), exact_fmt=graphene.String())
     parsers = graphene.Field(graphene.List(Parser))
 
     unstructured_data = (
@@ -4782,13 +4783,20 @@ def resolve_parser_results(self, info, entity_id):
             return_list.append(new_parser_result)
         return return_list
 
-    def resolve_parser_result(self, info, id):
+    def resolve_parser_result(self, info, id, exact_fmt='html'):
         client_id, object_id = id
-        result = DBSession.query(dbParserResult).filter_by(client_id=client_id,
-                                                           object_id=object_id,
-                                                           ).first()
-        if not result or result.marked_for_deletion:
+        result_orig = DBSession.query(dbParserResult).filter_by(client_id=client_id,
+                                                                object_id=object_id,
+                                                                ).first()
+        if not result_orig or result_orig.marked_for_deletion:
             return None
+
+        if exact_fmt == 'json':
+            result = copy.copy(result_orig)
+            result.content = get_result_json(result.content)
+
+        else:
+            result = result_orig
+
         parser_result = ParserResult(id=[result.client_id, result.object_id])
         parser_result.dbObject = result
         return parser_result
diff --git a/lingvodoc/utils/creation.py b/lingvodoc/utils/creation.py
index 9b98cdf8..8eaa3f10 100644
--- a/lingvodoc/utils/creation.py
+++ b/lingvodoc/utils/creation.py
@@ -2,7 +2,9 @@
 
 # Standard library imports.
 import base64
+import collections
 import hashlib
+import json
 import logging
 import os
 import random
@@ -12,6 +14,8 @@
 import string
 import time
 import urllib
+from bs4 import BeautifulSoup
+from bs4.element import Tag
 
 
 # Library imports.
@@ -60,6 +64,7 @@
 from lingvodoc.utils.search import translation_gist_search
 from lingvodoc.views.v2.utils import storage_file
 
+from pdb import set_trace as A
 
 # Setting up logging.
@@ -520,9 +525,124 @@ def async_create_parser_result_method(id, parser_id, entity_id, apertium_path, s
     task_status.set(2, 100, "Parsing of file " + content_filename + " finished")
 
 
-# Downloads a document by the URL in an entity's content and saves the result of its parsing
+
+def json_to_html(content):
+
+    html_output = BeautifulSoup('', 'html.parser')
+    attrs = {'id': 'id', 'state': 'class'}
+
+    for prg in content:
+        p_tag = html_output.new_tag("p")
+        for wrd in prg:
+            # if word has some attributes
+            if type(wrd) is dict:
+                w_span_tag = html_output.new_tag("span")
+                for key, attr in attrs.items():
+                    if key in wrd:
+                        w_span_tag[attr] = wrd[key]
+
+                # iterate by result spans
+                for res in wrd.get('results', []):
+                    r_span_tag = html_output.new_tag("span")
+                    for key, attr in attrs.items():
+                        r_span_tag[attr] = res.get(key)
+
+                    data = {k: v for k, v in res.items() if k not in attrs}
+                    r_span_tag.append(json.dumps(data))
+
+                    w_span_tag.append(r_span_tag)
+
+                w_span_tag.append(wrd.get('text', ""))
+
+                # wrap w_span_tag in prefix tags if any
+                for prefix in wrd.get('prefix', []):
+                    pfx_tag = html_output.new_tag(prefix)
+                    pfx_tag.append(w_span_tag)
+                    w_span_tag = pfx_tag
+
+                # append word to paragraph
+                p_tag.append(w_span_tag)
+
+            elif type(wrd) is str:
+                p_tag.append(wrd)
+
+        html_output.append(p_tag)
+
+    return str(html_output)
+
+
+def get_result_json(annotated_html):
+    parags_list = []
+    # iteration by <p> tags
+    for parag in BeautifulSoup(annotated_html, 'html.parser')('p'):
+        words_list = []
+        # iteration by <span> tags for single words,
+        # or by some other tags e.g. <b> with several words inside,
+        # or by simple text parts without any tag
+        for note in parag.contents:
+            prefix = []
+
+            def f(tag):
+                nonlocal prefix
+                annots = []
+                id = state = None
+                word_dict = collections.defaultdict(list)
+
+                if type(tag) is Tag:
+
+                    while tag.name != 'span':
+                        prefix.append(tag.name)
+                        # calling f() recursively because several words
+                        # may be inside e.g. <b> tags
+                        for t in tag.contents:
+                            f(t)
+                        return
+
+                    id = tag.get('id')
+                    state = tag.get('class')
+                    annots = tag.contents
+                    text = ""
+                    for i, t in enumerate(tag.contents):
+                        if type(t) is not Tag:
+                            annots.pop(i)
+                            text += t
+                    tag = text
+
+                # last tag from the loop above should be a textual string
+                word_dict['text'] = str(tag)
+
+                item_to_store = word_dict['text']
+                if id and state:
+                    word_dict['id'] = id
+                    word_dict['state'] = state
+                    if type(word_dict['state']) == list:
+                        word_dict['state'] = ' '.join(word_dict['state'])
+                    word_dict['results'] = []
+                    item_to_store = word_dict
+                if prefix:
+                    word_dict['prefix'] = prefix
+                    item_to_store = word_dict
+
+                # last annots from the loop above should contain results list
+                for ann in annots:
+                    if type(ann) is Tag:
+                        res = json.loads(ann.text)
+                        res['id'] = ann.get('id')
+                        res['state'] = ann.get('class')
+                        if type(res['state']) == list:
+                            res['state'] = ' '.join(res['state'])
+                        word_dict['results'].append(res)
+
+                words_list.append(item_to_store)
+
+            f(note)
+
+        parags_list.append(words_list)
+
+    return json.dumps(parags_list)
+
+
+# Downloads a document by the URL in an entity's content and saves the result of its parsing
 def create_parser_result(
     id, parser_id, entity_id, dedoc_url, apertium_path, storage, arguments = None, save_object = True):
@@ -547,20 +667,21 @@ def create_parser_result(
         os.path.basename(urllib.parse.urlparse(entity.content).path),
         source_stream)}
 
-    data = {'return_html': True}
-
-    r = requests.post(url=dedoc_url, files=files, data=data)
+    r = requests.post(url=dedoc_url, files=files, data={'return_html': True})
     dedoc_output = re.sub(r"<sub>(.*?)</sub>", "", r.content.decode('utf-8'))
 
-    if parser.method.find("apertium") != -1:
-        result = parse_method(dedoc_output, apertium_path, **arguments)
-    else:
+    # we get the result as html
+    if "timarkh" in parser.method:
         result = parse_method(dedoc_output, **arguments)
+    elif "apertium" in parser.method:
+        result = parse_method(dedoc_output, apertium_path, **arguments)
+
     dbparserresult = ParserResult(client_id=client_id, object_id=object_id,
                                   parser_object_id=parser_object_id, parser_client_id=parser_client_id,
                                   entity_client_id=entity_client_id, entity_object_id=entity_object_id,
                                   arguments=arguments, content=result)
+
    if not dbparserresult.object_id:
        dbparserresult.object_id = get_client_counter(client_id)
    if save_object:
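
Note: below is a minimal round-trip sketch of the two helpers added in this patch, not part of the patch itself. It assumes json_to_html() and get_result_json() are imported from lingvodoc.utils.creation as in the diff above; the sample word entry and its 'wf'/'gr' result fields are hypothetical stand-ins for real parser output.

    # Round trip: json parser data -> annotated html -> json again.
    import json

    from lingvodoc.utils.creation import get_result_json, json_to_html

    # One paragraph: a plain text chunk plus one annotated word.
    # 'state' becomes the html class, 'prefix' lists wrapping tags (bold here),
    # and each entry in 'results' is serialized into a nested <span>.
    content = [
        [
            "Text before the annotated word: ",
            {
                'id': 'w1',
                'state': 'unverified',
                'text': 'word',
                'prefix': ['b'],
                'results': [
                    {'id': 'w1r1', 'state': 'result', 'wf': 'word', 'gr': 'N,sg'},
                ],
            },
        ],
    ]

    html = json_to_html(content)
    print(html)
    # -> <p>Text before the annotated word: <b><span class="unverified" id="w1">
    #    <span class="result" id="w1r1">{"wf": "word", "gr": "N,sg"}</span>word</span></b></p>

    # get_result_json() recovers the json structure from the annotated html.
    word = json.loads(get_result_json(html))[0][-1]
    assert word['text'] == 'word' and word['prefix'] == ['b']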