Skip to content

Commit

Permalink
Adverb statistics -- ispras/lingvodoc-react#1128 (#1509)
Browse files Browse the repository at this point in the history
* all_changes

* cleanup

* fixes for delete
  • Loading branch information
vmonakhov authored Jul 9, 2024
1 parent f876b39 commit 4ee7cd2
Show file tree
Hide file tree
Showing 6 changed files with 3,392 additions and 1,667 deletions.
32 changes: 32 additions & 0 deletions alembic/versions/272921300885_add_hash_adverb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""add_hash_adverb
Revision ID: 483f9330348c
Revises: 9a82fe69ceee
Create Date: 2023-04-03 17:07:49.155801
"""

# revision identifiers, used by Alembic.
revision = '483f9330348c'
down_revision = '9a82fe69ceee'
branch_labels = None
depends_on = None

from alembic import op
import sqlalchemy as sa


def upgrade():
    """Add NOT NULL ``hash_adverb`` column to ``valency_parser_data``.

    The column is added as nullable, existing rows are backfilled with
    the empty string, and only then is the NOT NULL constraint applied —
    a plain ``ADD COLUMN ... NOT NULL`` would fail on existing rows.
    An empty string appears to mean "not yet processed for adverbs"
    (downstream code filters on ``hash_adverb != ''``) — TODO confirm.
    """
    op.execute('''
        ALTER TABLE valency_parser_data
        ADD hash_adverb TEXT;
        UPDATE valency_parser_data
        SET hash_adverb = ''
        WHERE hash_adverb IS NULL;
        ALTER TABLE valency_parser_data
        ALTER COLUMN hash_adverb SET NOT NULL;
        ''')

def downgrade():
    """Revert the migration by dropping the ``hash_adverb`` column."""
    op.execute('''
        ALTER TABLE valency_parser_data
        DROP COLUMN hash_adverb;
        ''')
50 changes: 50 additions & 0 deletions alembic/versions/7c6124cdd1f6_adverb_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""Adverb data
Revision ID: 9a82fe69ceee
Revises: be06149acd44
Create Date: 2023-03-22 21:47:10.366535
"""

# revision identifiers, used by Alembic.
revision = '9a82fe69ceee'
down_revision = 'be06149acd44'
branch_labels = None
depends_on = None

from alembic import op
import sqlalchemy as sa


def upgrade():
    """Create adverb instance/annotation tables and their lookup indices.

    ``adverb_instance_data``: one adverb occurrence inside a valency
    sentence (position ``index``, lexeme ``adverb_lex``, grammatical
    case string ``case_str``).

    ``adverb_annotation_data``: one user's verdict on an instance;
    ``accepted`` defaults to NULL, presumably meaning "not yet
    reviewed" — TODO confirm against the annotation UI.

    The three indices support lookups by sentence, by adverb lexeme and
    by case string.
    """

    op.execute('''
        CREATE TABLE adverb_instance_data (
        id BIGSERIAL PRIMARY KEY,
        sentence_id BIGINT NOT NULL REFERENCES valency_sentence_data(id),
        index INT NOT NULL,
        adverb_lex TEXT NOT NULL,
        case_str TEXT NOT NULL
        );
        CREATE TABLE adverb_annotation_data (
        instance_id BIGINT NOT NULL REFERENCES adverb_instance_data(id),
        user_id BIGINT NOT NULL REFERENCES public.user(id),
        accepted BOOLEAN DEFAULT null,
        PRIMARY KEY (instance_id, user_id)
        );
        CREATE INDEX adverb_instance_data_sentence_id_index
        ON adverb_instance_data (sentence_id);
        CREATE INDEX adverb_instance_data_adverb_lex_index
        ON adverb_instance_data (adverb_lex);
        CREATE INDEX adverb_instance_data_case_str_index
        ON adverb_instance_data (case_str);
        ''')


def downgrade():
    """Drop the adverb tables and indices created by :func:`upgrade`.

    Indices are dropped explicitly before the tables; this is redundant
    (DROP TABLE removes its indices) but harmless.
    """

    op.execute('''
        DROP INDEX adverb_instance_data_sentence_id_index CASCADE;
        DROP INDEX adverb_instance_data_adverb_lex_index CASCADE;
        DROP INDEX adverb_instance_data_case_str_index CASCADE;
        DROP TABLE adverb_instance_data CASCADE;
        DROP TABLE adverb_annotation_data CASCADE;
        ''')
23 changes: 22 additions & 1 deletion lingvodoc/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2733,7 +2733,7 @@ def __table_args__(cls):
super().__table_args__)

hash = Column(UnicodeText)

hash_adverb = Column(UnicodeText)

class ValencyEafData(
Base,
Expand Down Expand Up @@ -2799,3 +2799,24 @@ class ValencyMergeData(
verb_lex = Column(UnicodeText, nullable = False, primary_key = True)
merge_id = Column(SLBigInteger(), nullable = False)


class AdverbInstanceData(
    Base,
    IdMixin):
    """
    One adverb occurrence found in a valency sentence.

    Maps to the ``adverb_instance_data`` table created by the
    "Adverb data" migration (revision 9a82fe69ceee).
    """

    __tablename__ = 'adverb_instance_data'

    # Sentence the adverb was found in.
    sentence_id = Column(SLBigInteger(), ForeignKey('valency_sentence_data.id'), nullable = False)
    # Position of the adverb within the sentence — presumably a word
    # index; confirm against the code that populates this table.
    index = Column(Integer(), nullable = False)
    # The adverb lexeme itself.
    adverb_lex = Column(UnicodeText, nullable = False)
    # Grammatical case marker string associated with the occurrence.
    case_str = Column(UnicodeText, nullable = False)


class AdverbAnnotationData(
    Base):
    """
    A single user's annotation verdict on an adverb instance.

    Composite primary key (instance_id, user_id): at most one verdict
    per user per instance. Maps to ``adverb_annotation_data``.
    """

    __tablename__ = 'adverb_annotation_data'

    instance_id = Column(SLBigInteger(), ForeignKey('adverb_instance_data.id'), primary_key = True)
    user_id = Column(SLBigInteger(), ForeignKey('user.id'), primary_key = True)
    # True = accepted, False = rejected; default None presumably means
    # "not yet reviewed" — confirm against the annotation workflow.
    accepted = Column(Boolean, default = None)
116 changes: 115 additions & 1 deletion lingvodoc/schema/gql_dictionaryperspective.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@
user_to_group_association,
ValencyEafData as dbValencyEafData,
ValencyParserData as dbValencyParserData,
ValencySourceData as dbValencySourceData)
ValencySourceData as dbValencySourceData,
ValencySentenceData as dbValencySentenceData,
AdverbInstanceData as dbAdverbInstanceData)

from lingvodoc.schema.gql_column import Column
from lingvodoc.schema.gql_dictionary import Dictionary
Expand Down Expand Up @@ -239,7 +241,9 @@ class DictionaryPerspective(LingvodocObjectType):

is_hidden_for_client = graphene.Boolean()
has_valency_data = graphene.Boolean()
has_adverb_data = graphene.Boolean()
new_valency_data_count = graphene.Int()
new_adverb_data_count = graphene.Int()

dbType = dbPerspective

Expand Down Expand Up @@ -566,6 +570,30 @@ def resolve_has_valency_data(self, info):
.query(exists_query)
.scalar())

def resolve_has_adverb_data(self, info):
    """
    If the perspective has adverb annotation data.

    True when at least one valency source of this perspective has a
    parser-data row with a non-empty adverb hash.
    """

    client_id, object_id = self.id

    adverb_data_exists = (

        DBSession

            .query(
                literal(1))

            .filter(
                dbValencySourceData.perspective_client_id == client_id,
                dbValencySourceData.perspective_object_id == object_id,
                dbValencyParserData.id == dbValencySourceData.id,
                dbValencyParserData.hash_adverb != '')

            .exists())

    return DBSession.query(adverb_data_exists).scalar()

def resolve_new_valency_data_count(self, info):
"""
How many unprocessed valency sources perspective has.
Expand Down Expand Up @@ -703,6 +731,92 @@ def resolve_new_valency_data_count(self, info):

return new_hash_count

def resolve_new_adverb_data_count(self, info):
    """
    How many unprocessed adverb sources perspective has.

    Compares SHA-256 hashes of currently available parser results
    ("ready") against the ``hash_adverb`` values already recorded in
    valency parser data ("has"), and counts ready hashes not yet
    recorded.
    """

    debug_flag = False

    # Hex SHA-256 of every parser result attached to a published,
    # accepted, non-deleted entity of a non-deleted lexical entry of
    # this perspective — the adverb-processing candidates.
    ready_hash_subquery = (
        DBSession

            .query(

                func.encode(
                    func.digest(
                        dbParserResult.content, 'sha256'),
                    'hex')

                .label('hash'))

            .filter(
                dbLexicalEntry.parent_client_id == self.id[0],
                dbLexicalEntry.parent_object_id == self.id[1],
                dbLexicalEntry.marked_for_deletion == False,
                dbEntity.parent_client_id == dbLexicalEntry.client_id,
                dbEntity.parent_object_id == dbLexicalEntry.object_id,
                dbEntity.marked_for_deletion == False,
                dbPublishingEntity.client_id == dbEntity.client_id,
                dbPublishingEntity.object_id == dbEntity.object_id,
                dbPublishingEntity.published == True,
                dbPublishingEntity.accepted == True,
                dbParserResult.entity_client_id == dbEntity.client_id,
                dbParserResult.entity_object_id == dbEntity.object_id,
                dbParserResult.marked_for_deletion == False)

            .subquery())

    ready_hash_count = (
        DBSession
            .query(ready_hash_subquery)
            .count())

    # Non-empty adverb hashes already stored for this perspective's
    # valency sources ('' marks sources not yet processed for adverbs).
    has_hash_subquery = (
        DBSession

            .query(
                dbValencyParserData.hash_adverb)

            .filter(
                dbValencySourceData.perspective_client_id == self.id[0],
                dbValencySourceData.perspective_object_id == self.id[1],
                dbValencyParserData.id == dbValencySourceData.id,
                dbValencyParserData.hash_adverb != '')

            .subquery())

    has_hash_count = (
        DBSession
            .query(has_hash_subquery)
            .count())

    if debug_flag:
        log.debug(
            f'ready_hash_count: {ready_hash_count}\n'
            f'has_hash_count: {has_hash_count}')

    # Ready hashes that have no matching stored adverb hash, i.e.
    # parser results not yet processed for adverbs.
    new_hash_count = (
        DBSession

            .query(
                ready_hash_subquery.c.hash)

            .filter(
                ready_hash_subquery.c.hash.notin_(
                    has_hash_subquery))

            .count())

    if debug_flag:

        log.debug(
            f'new_hash_count: {new_hash_count}')

    # The boolean adds 1 when there are more stored adverb hashes than
    # ready parser results: stale hashes, hashes of deleted parser
    # results, or duplicate sources — all signal a needed update.
    return new_hash_count + (has_hash_count > ready_hash_count)

@fetch_object()
def resolve_lexical_entries(self, info, ids=None, mode=None, authors=None, clients=None, start_date=None, end_date=None,
position=1):
Expand Down
Loading

0 comments on commit 4ee7cd2

Please sign in to comment.