Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop' into mwestats
Browse files Browse the repository at this point in the history
  • Loading branch information
oktaal committed Mar 8, 2024
2 parents 050b1c0 + 8d89be8 commit 8819941
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 7 deletions.
29 changes: 29 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# This CITATION.cff file was generated with cffinit.
# Visit https://bit.ly/cffinit to generate yours today!

cff-version: 1.2.0
title: MWE Query
message: >-
  If you use this software, please cite it using the
  metadata from this file.
type: software
authors:
  - given-names: Jan
    family-names: Odijk
    affiliation: Utrecht University
  - given-names: Martin
    family-names: Kroon
    affiliation: Utrecht University
    orcid: 'https://orcid.org/0000-0003-3059-6872'
  - name: >-
      Research Software Lab, Centre for Digital Humanities,
      Utrecht University
    website: >-
      https://cdh.uu.nl/centre-for-digital-humanities/research-software-lab/
    city: Utrecht
    country: NL
identifiers:
  - type: doi
    value: 10.5281/zenodo.10410636
repository-code: 'https://github.com/UUDigitalHumanitieslab/mwe-query'
license: BSD-3-Clause
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# MWE Query
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10410636.svg)](https://doi.org/10.5281/zenodo.10410636)
[![PyPI version](https://badge.fury.io/py/mwe-query.svg)](https://badge.fury.io/py/mwe-query)
[![Actions Status](https://github.com/UUDigitalHumanitiesLab/mwe-query/workflows/Tests/badge.svg)](https://github.com/UUDigitalHumanitiesLab/mwe-query/actions)

# MWE Query

## Run Locally

Expand Down
23 changes: 18 additions & 5 deletions mwe_query/canonicalform.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,14 @@ def mknewnode(stree: SynTree, mwetop: int, atts: List[str], annotations: List[in
newnode.attrib['maxnodecount'] = f'{len(stree)}'
return newnode


def expandnonheadwordnode(nonheadwordnode, phrasenodeproperties):
    """Wrap a copy of a word node in a newly created phrase node.

    The new phrase node takes over the ``rel`` of *nonheadwordnode*, and the
    copy of *nonheadwordnode* becomes its only child with ``rel`` set to
    ``'hd'``. The node passed in is not modified by this function.

    :param nonheadwordnode: the node to wrap.
    :param phrasenodeproperties: mapping of attributes for the new phrase
        node; a ``rel`` entry in it is overwritten by the rel of
        *nonheadwordnode*.
    :return: the new phrase node containing the copied node.
    """
    # Read the relation first: the phrase node takes over this function.
    originalrel = gav(nonheadwordnode, 'rel')

    headcopy = copy.copy(nonheadwordnode)
    headcopy.attrib['rel'] = 'hd'

    phrasenode = ET.Element('node', attrib=phrasenodeproperties)
    phrasenode.attrib['rel'] = originalrel
    phrasenode.append(headcopy)
    return phrasenode
def zullenheadclause(stree: SynTree) -> bool:
if stree.tag == 'node':
cat = gav(stree, 'cat')
Expand Down Expand Up @@ -1038,9 +1045,10 @@ def newgenvariants(stree: SynTree) -> List[SynTree]:
Rpronounobj1node = copy.copy(obj1node)
Rpronounobj1node.attrib['lemma'] = 'er|hier|daar|waar|ergens|nergens|overal'
Rpronounobj1node.attrib['pt'] = 'vnw'
newphrase = expandnonheadwordnode(Rpronounobj1node, {})
for child in newppnode2:
newppnode2.remove(child)
newppnode2.append(Rpronounobj1node)
newppnode2.append(newphrase)
newppnode2.append(newvz2)

# pp with R-pronoun object which has been replaced by a full NO with a dummymod
Expand Down Expand Up @@ -1069,11 +1077,15 @@ def newgenvariants(stree: SynTree) -> List[SynTree]:
pppronadvvcnode.append(pronadvnode1)
pppronadvvcnode.append(newvcnode)

# pp's with a pronominal adverb. e.g. daarnaar
pprel = gav(ppnode, 'rel')
pronadvnode = getpronadv(vzlemma, pprel)
pronadvppnode = expandnonheadwordnode(pronadvnode, {'cat': 'pp', 'rel': pprel})
pronadvnode.attrib['rel'] = 'hd'
pronadvppnode.append(pronadvnode)

alternativesnode = mkalternativesnode([[ppnode], [newppnode2], [newppnode3], [
pppobj1vcnode], [pppronadvvcnode], [pronadvnode]])
pppobj1vcnode], [pppronadvvcnode], [pronadvppnode]])
parent.append(alternativesnode)

vblgennpnodeids = xpath_values(
Expand Down Expand Up @@ -1429,8 +1441,9 @@ def relpronsubst(stree: SynTree) -> SynTree:
def expandfull(stree: SynTree) -> SynTree:
    """Return *stree* after applying the full chain of expansions.

    The tree is passed, in this order, through ``relpronsubst``,
    ``expandnonheadwords`` and ``indextransform``; the result of the last
    step is returned.

    :param stree: the syntactic tree to expand.
    :return: the tree produced by the final transformation.
    """
    # possibly add getlcat
    stree1 = relpronsubst(stree)
    # Previously the function returned right after indextransform(stree1),
    # which left the expandnonheadwords step unreachable.
    stree2 = expandnonheadwords(stree1)
    stree3 = indextransform(stree2)
    return stree3


def gettopnode(stree):
Expand Down
7 changes: 6 additions & 1 deletion mwe_query/lcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import copy
import lxml.etree as ET

dummy = 'dummy'


def expandnonheadwords(stree: SynTree) -> SynTree:
# it is presupposed that the input stree is not None
Expand Down Expand Up @@ -48,7 +50,8 @@ def getlcatatt(node: SynTree) -> str:

def mkphrase(child: SynTree) -> SynTree:
newnode = ET.Element('node')
newnode.attrib['id'] = str(child.attrib['id']) + 'a'
if 'íd' in child.attrib:
newnode.attrib['id'] = str(child.attrib['id']) + 'a'
lcat = getlcatatt(child)
if lcat in validcats:
newnode.attrib['cat'] = lcat
Expand Down Expand Up @@ -177,6 +180,8 @@ def getlcat(node: SynTree, prel=None) -> Optional[str]: # noqa: C901
result = 'np'
elif pt == 'spec':
result = None
elif pt == dummy:
result = None
else:
print('Unknown att value (pt) encountered in:')
ET.dump(node)
Expand Down
52 changes: 52 additions & 0 deletions mwe_query/trymwes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from sastadev.alpinoparsing import parse
from lcat import expandnonheadwords
from sastadev.treebankfunctions import indextransform
from lxml import etree
from canonicalform import generatequeries, expandfull

# When True, trysomemwes() also dumps the expanded parses and every query hit.
debug = False

# Example item: (MWE string handed to generatequeries, utterances to parse).
geenhaankraaien = (
    '0geen *haan zal naar iets kraaien',
    [
        'Daar kraait geen haan naar',
        'Hier heeft geen haan naar gekraaid',
        'geen haan kraaide daarnaar',
        'geen haan kraaide ernaar dat hij niet kwam',
        'geen haan kraaide er naar dat hij niet kwam',
        'er is geen haan die daar naar kraait',
    ],
)

def select(mweutts, utt=None):
    """Return *mweutts* as is, or narrowed down to a single utterance.

    :param mweutts: pair whose first item is the MWE and whose second item
        is a sequence of utterances.
    :param utt: index of the utterance to keep; ``None`` keeps everything.
    :return: *mweutts* itself when *utt* is ``None``, otherwise a new tuple
        ``(mwe, [utterance])`` holding only the selected utterance.
    """
    if utt is None:
        return mweutts
    mwe = mweutts[0]
    chosen = mweutts[1][utt]
    return (mwe, [chosen])

def getparses(utterances):
    """Return a list with the parse of each utterance, in input order.

    :param utterances: iterable of utterances, each passed to ``parse``.
    :return: list of whatever ``parse`` returns, one entry per utterance.
    """
    return [parse(utterance) for utterance in utterances]

def trysomemwes():
    """Run the queries generated for the example MWE on its utterances.

    The queries for the MWE are labelled ``MWEQ``, ``NMQ`` and ``MLQ``.
    Every utterance is parsed and expanded with ``expandfull``, after which
    the number of hits of each query is printed. When the module-level
    ``debug`` flag is set, the expanded parse and the individual hits are
    dumped as well.
    """
    mwe, utterances = select(geenhaankraaien)
    queries = generatequeries(mwe)
    labeledqueries = (
        ('MWEQ', queries[0]),
        ('NMQ', queries[1]),
        ('MLQ', queries[2]),
    )
    # All utterances are parsed up front, before any output is printed.
    parses = getparses(utterances)
    for utterance, parsetree in zip(utterances, parses):
        print(f'{utterance}:')
        expanded = expandfull(parsetree)
        if debug:
            etree.dump(expanded)
        for label, query in labeledqueries:
            hits = expanded.xpath(query)
            if debug:
                print('Found hits:')
                for hit in hits:
                    etree.dump(hit)
            print(f'{label}: {len(hits)}')




if __name__ == '__main__':
    # Run the demonstration only when executed as a script, not on import.
    trysomemwes()

0 comments on commit 8819941

Please sign in to comment.