Better parser data -- ispras/lingvodoc-react#1120 (#1508)
* init

* get paragraph id

* get dedoc data

* text from dedoc

* full results structure

* fixes

* fixes

* undo doc_parser.py

* handling several bold words

* fix

* next

* fixes

* better bold font and refactoring

* refactoring

* save to_json

* most correct version

* use json any way

* cleanup

* cleanup

* next changes

* some fixes

* json_to_html

* next steps

* fixes after testing

* fixes

* fixed for strange parsers
vmonakhov authored Jun 25, 2024
1 parent a88aa84 commit 4e6db8d
Showing 3 changed files with 150 additions and 13 deletions.
8 changes: 8 additions & 0 deletions lingvodoc/schema/gql_parserresult.py
@@ -98,6 +98,7 @@

from lingvodoc.utils.creation import (
create_parser_result,
json_to_html,
async_create_parser_result)

import lingvodoc.utils.doc_parser as ParseMethods
@@ -108,6 +109,8 @@
as_storage_file,
storage_file)

from pdb import set_trace as A


# Setting up logging.
log = logging.getLogger(__name__)
@@ -444,6 +447,7 @@ class Arguments:
id = LingvodocID(required=True)
element_id = graphene.String()
content = graphene.String()
content_fmt = graphene.String()
reexecute = graphene.Boolean()
synchronous = graphene.Boolean()

@@ -455,6 +459,10 @@ def mutate(root, info, **args):
parser_result_id = args.get('id')
element_id = args.get('element_id')
content_from_args = args.get('content')
content_fmt = args.get('content_fmt', 'html')
# incoming content may be in JSON format
if len(content_from_args or []) > 0 and content_fmt == 'json':
content_from_args = json_to_html(json.loads(content_from_args))
reexecute = args.get('reexecute')
synchronous = args.get('synchronous', False)

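A minimal sketch of what the new content_fmt branch does, assuming json_to_html from lingvodoc.utils.creation as imported above; the sample word attributes are invented. Content submitted as JSON is converted back to annotated HTML before it is stored.

import json

from lingvodoc.utils.creation import json_to_html

content_from_args = '[[{"id": "1", "state": "verified", "text": "word", "results": []}]]'
content_fmt = 'json'

# mirror the mutation's check: only convert non-empty JSON content
if len(content_from_args or []) > 0 and content_fmt == 'json':
    content_from_args = json_to_html(json.loads(content_from_args))

# content_from_args is now HTML, roughly:
# '<p><span class="verified" id="1">word</span></p>'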
20 changes: 14 additions & 6 deletions lingvodoc/schema/query.py
@@ -313,6 +313,7 @@

from lingvodoc.utils.creation import (
create_entity,
get_result_json,
edit_role)

from lingvodoc.utils.elan_functions import tgt_to_eaf
@@ -596,7 +597,7 @@ class Query(graphene.ObjectType):
client_id_list = graphene.List(graphene.Int, required = True)))
parser_results = graphene.Field((graphene.List(ParserResult)),
entity_id = LingvodocID(), parser_id=LingvodocID())
parser_result = graphene.Field(ParserResult, id=LingvodocID())
parser_result = graphene.Field(ParserResult, id=LingvodocID(), exact_fmt=graphene.String())
parsers = graphene.Field(graphene.List(Parser))

unstructured_data = (
@@ -4782,13 +4783,20 @@ def resolve_parser_results(self, info, entity_id):
return_list.append(new_parser_result)
return return_list

def resolve_parser_result(self, info, id):
def resolve_parser_result(self, info, id, exact_fmt='html'):
client_id, object_id = id
result = DBSession.query(dbParserResult).filter_by(client_id=client_id,
object_id=object_id,
).first()
if not result or result.marked_for_deletion:
result_orig = DBSession.query(dbParserResult).filter_by(client_id=client_id,
object_id=object_id,
).first()
if not result_orig or result_orig.marked_for_deletion:
return None

if exact_fmt == 'json':
result = copy.copy(result_orig)
result.content = get_result_json(result.content)
else:
result = result_orig

parser_result = ParserResult(id=[result.client_id, result.object_id])
parser_result.dbObject = result
return parser_result
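For reference, a hypothetical client request against the extended parser_result field, only as a sketch: it assumes the lingvodoc schema keeps snake_case field names, that ParserResult exposes its content, and the id values are invented.

# Hypothetical GraphQL request using the new exact_fmt argument;
# with exact_fmt == 'json' the resolver returns content converted by
# get_result_json instead of the stored HTML.
PARSER_RESULT_JSON_QUERY = '''
query {
  parser_result(id: [123, 456], exact_fmt: "json") {
    id
    content
  }
}
'''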
135 changes: 128 additions & 7 deletions lingvodoc/utils/creation.py
@@ -2,7 +2,9 @@
# Standard library imports.

import base64
import collections
import hashlib
import json
import logging
import os
import random
@@ -12,6 +14,8 @@
import string
import time
import urllib
from bs4 import BeautifulSoup
from bs4.element import Tag

# Library imports.

@@ -60,6 +64,7 @@
from lingvodoc.utils.search import translation_gist_search

from lingvodoc.views.v2.utils import storage_file
from pdb import set_trace as A


# Setting up logging.
@@ -520,9 +525,124 @@ def async_create_parser_result_method(id, parser_id, entity_id, apertium_path, s

task_status.set(2, 100, "Parsing of file " + content_filename + " finished")

# Downloads a document by the URL in an entity's content and saves the result of its parsing

def json_to_html(content):

html_output = BeautifulSoup()
attrs = {'id': 'id', 'state': 'class'}

for prg in content:
p_tag = html_output.new_tag("p")
for wrd in prg:
# if the word has attributes
if type(wrd) is dict:
w_span_tag = html_output.new_tag("span")
for key, attr in attrs.items():
if key in wrd:
w_span_tag[attr] = wrd[key]

# iterate over result spans
for res in wrd.get('results', []):
r_span_tag = html_output.new_tag("span")
for key, attr in attrs.items():
r_span_tag[attr] = res.get(key)

data = {k: v for k, v in res.items() if k not in attrs}
r_span_tag.append(json.dumps(data))

w_span_tag.append(r_span_tag)

w_span_tag.append(wrd.get('text', ""))

# wrap w_span_tag in prefix tags if any
for prefix in wrd.get('prefix', []):
pfx_tag = html_output.new_tag(prefix)
pfx_tag.append(w_span_tag)
w_span_tag = pfx_tag

# append word to paragraph
p_tag.append(w_span_tag)

elif type(wrd) is str:
p_tag.append(wrd)

html_output.append(p_tag)

return str(html_output)
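
As a rough illustration of the structure json_to_html consumes (a sketch only; the attribute values below are invented): content is a list of paragraphs, each paragraph is a list of words, and a word is either a plain string or a dict with 'id', 'state', 'text' and optional 'results' and 'prefix' keys.

sample = [
    [
        'Plain text between words, ',
        {
            'id': '42',               # element id, invented here
            'state': 'verified',      # becomes the word span's class
            'text': 'word',
            'results': [{'id': '42_1', 'state': 'result', 'lemma': 'word'}],
            'prefix': ['b'],          # wrap the word span in <b>...</b>
        },
    ],
]

html = json_to_html(sample)
# roughly: '<p>Plain text between words, <b><span class="verified" id="42">'
#          '<span class="result" id="42_1">{"lemma": "word"}</span>word</span></b></p>'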


def get_result_json(annotated_html):
parags_list = []
# iterate over <p></p> tags
for parag in BeautifulSoup(annotated_html, 'html.parser')('p'):
words_list = []
# iterate over <span></span> tags holding single words,
# over other tags, e.g. <b></b>, that may contain several words,
# or over plain text parts without any tag
for note in parag.contents:
prefix = []

def f(tag):
nonlocal prefix
annots = []
id = state = None
word_dict = collections.defaultdict(list)

if type(tag) is Tag:

while tag.name != 'span':
prefix.append(tag.name)
# call f() recursively because several words
# may be nested inside e.g. <b></b> tags
for t in tag.contents:
f(t)
return

id = tag.get('id')
state = tag.get('class')
annots = tag.contents
text = ""
for i, t in enumerate(tag.contents):
if type(t) is not Tag:
annots.pop(i)
text += t
tag = text

# at this point tag should hold the word's plain text
word_dict['text'] = str(tag)

item_to_store = word_dict['text']
if id and state:
word_dict['id'] = id
word_dict['state'] = state
if type(word_dict['state']) == list:
word_dict['state'] = ' '.join(word_dict['state'])
word_dict['results'] = []
item_to_store = word_dict
if prefix:
word_dict['prefix'] = prefix
item_to_store = word_dict

# annots collected above should contain the result spans, if any
for ann in annots:
if type(ann) is Tag:
res = json.loads(ann.text)
res['id'] = ann.get('id')
res['state'] = ann.get('class')
if type(res['state']) == list:
res['state'] = ' '.join(res['state'])
word_dict['results'].append(res)

words_list.append(item_to_store)

f(note)

parags_list.append(words_list)

return json.dumps(parags_list)
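
And the reverse direction, again only a sketch with the invented values from the example above: feeding annotated HTML of the same shape to get_result_json yields the paragraph/word structure back as a JSON string, so the two helpers roughly round-trip each other.

annotated = (
    '<p><span class="verified" id="42">'
    '<span class="result" id="42_1">{"lemma": "word"}</span>word</span></p>'
)

parsed = json.loads(get_result_json(annotated))
# roughly: [[{'text': 'word', 'id': '42', 'state': 'verified',
#             'results': [{'lemma': 'word', 'id': '42_1', 'state': 'result'}]}]]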


# Downloads a document by the URL in an entity's content and saves the result of its parsing
def create_parser_result(
id, parser_id, entity_id, dedoc_url, apertium_path, storage, arguments = None, save_object = True):

@@ -547,20 +667,21 @@ def create_parser_result(
os.path.basename(urllib.parse.urlparse(entity.content).path),
source_stream)}

data = {'return_html': True}

r = requests.post(url=dedoc_url, files=files, data=data)
r = requests.post(url=dedoc_url, files=files, data={'return_html': True})
dedoc_output = re.sub(r"(<sub>.*?</sub>)", "", r.content.decode('utf-8'))

if parser.method.find("apertium") != -1:
result = parse_method(dedoc_output, apertium_path, **arguments)
else:
# we get the result as HTML
if "timarkh" in parser.method:
result = parse_method(dedoc_output, **arguments)

elif "apertium" in parser.method:
result = parse_method(dedoc_output, apertium_path, **arguments)

dbparserresult = ParserResult(client_id=client_id, object_id=object_id,
parser_object_id=parser_object_id, parser_client_id=parser_client_id,
entity_client_id=entity_client_id, entity_object_id=entity_object_id,
arguments=arguments, content=result)

if not dbparserresult.object_id:
dbparserresult.object_id = get_client_counter(client_id)
if save_object:
