From 9f0dfe9d7bc5e34eb5e6911ef814a5bafdf7b6fe Mon Sep 17 00:00:00 2001 From: Ben Bonfil Date: Wed, 13 Dec 2023 09:30:11 +0100 Subject: [PATCH 1/3] improvements for R-pronouns and relativisation (#10) Co-authored-by: Jan Odijk --- mwe_query/canonicalform.py | 23 +++++++++++++---- mwe_query/lcat.py | 7 ++++- mwe_query/trymwes.py | 52 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 6 deletions(-) create mode 100644 mwe_query/trymwes.py diff --git a/mwe_query/canonicalform.py b/mwe_query/canonicalform.py index c9e771c..3612cea 100644 --- a/mwe_query/canonicalform.py +++ b/mwe_query/canonicalform.py @@ -386,7 +386,14 @@ def mknewnode(stree, mwetop, atts, annotations): newnode.attrib['maxnodecount'] = f'{len(stree)}' return newnode - +def expandnonheadwordnode(nonheadwordnode, phrasenodeproperties): + phraserel = gav(nonheadwordnode, 'rel') + newnonheadwordnode = copy.copy(nonheadwordnode) + newnonheadwordnode.attrib['rel'] = 'hd' + phrasenode = ET.Element('node', attrib=phrasenodeproperties) + phrasenode.attrib['rel'] = phraserel + phrasenode.append(newnonheadwordnode) + return phrasenode def zullenheadclause(stree: SynTree) -> bool: if stree.tag == 'node': cat = gav(stree, 'cat') @@ -1016,9 +1023,10 @@ def newgenvariants(stree: SynTree) -> List[SynTree]: Rpronounobj1node = copy.copy(obj1node) Rpronounobj1node.attrib['lemma'] = 'er|hier|daar|waar|ergens|nergens|overal' Rpronounobj1node.attrib['pt'] = 'vnw' + newphrase = expandnonheadwordnode(Rpronounobj1node, {}) for child in newppnode2: newppnode2.remove(child) - newppnode2.append(Rpronounobj1node) + newppnode2.append(newphrase) newppnode2.append(newvz2) # pp with R-pronoun object which has been replaced by a full NO with a dummymod @@ -1047,11 +1055,15 @@ def newgenvariants(stree: SynTree) -> List[SynTree]: pppronadvvcnode.append(pronadvnode1) pppronadvvcnode.append(newvcnode) + # pp's with a pronominal adverb. e.g. daarnaar pprel = gav(ppnode, 'rel') pronadvnode = getpronadv(vzlemma, pprel) + pronadvppnode = expandnonheadwordnode(pronadvnode, {'cat': 'pp', 'rel': pprel}) + pronadvnode.attrib['rel'] = 'hd' + pronadvppnode.append(pronadvnode) alternativesnode = mkalternativesnode([[ppnode], [newppnode2], [newppnode3], [ - pppobj1vcnode], [pppronadvvcnode], [pronadvnode]]) + pppobj1vcnode], [pppronadvvcnode], [pronadvppnode]]) parent.append(alternativesnode) vblgennpnodeids = newstree.xpath( @@ -1404,8 +1416,9 @@ def relpronsubst(stree: SynTree) -> SynTree: def expandfull(stree: SynTree) -> SynTree: # possibly add getlcat stree1 = relpronsubst(stree) - stree2 = indextransform(stree1) - return stree2 + stree2 = expandnonheadwords(stree1) + stree3 = indextransform(stree2) + return stree3 def gettopnode(stree): diff --git a/mwe_query/lcat.py b/mwe_query/lcat.py index 82ccf4c..76117f5 100644 --- a/mwe_query/lcat.py +++ b/mwe_query/lcat.py @@ -8,6 +8,8 @@ import copy import lxml.etree as ET +dummy = 'dummy' + def expandnonheadwords(stree: SynTree) -> SynTree: # it is presupposed that the input stree is not None @@ -47,7 +49,8 @@ def getlcatatt(node: SynTree) -> str: def mkphrase(child: SynTree) -> SynTree: newnode = ET.Element('node') - newnode.attrib['id'] = child.attrib['id'] + 'a' + if 'íd' in child.attrib: + newnode.attrib['id'] = child.attrib['id'] + 'a' lcat = getlcatatt(child) if lcat in validcats: newnode.attrib['cat'] = lcat @@ -176,6 +179,8 @@ def getlcat(node: SynTree, prel=None) -> str: # noqa: C901 result = 'np' elif pt == 'spec': result = None + elif pt == dummy: + result = None else: print('Unknown att value (pt) encountered in:') ET.dump(node) diff --git a/mwe_query/trymwes.py b/mwe_query/trymwes.py new file mode 100644 index 0000000..3962fc2 --- /dev/null +++ b/mwe_query/trymwes.py @@ -0,0 +1,52 @@ +from sastadev.alpinoparsing import parse +from lcat import expandnonheadwords +from sastadev.treebankfunctions import indextransform +from lxml import etree +from canonicalform import generatequeries, expandfull + +debug = False + +geenhaankraaien = ('0geen *haan zal naar iets kraaien', + ['Daar kraait geen haan naar', 'Hier heeft geen haan naar gekraaid', + 'geen haan kraaide daarnaar', 'geen haan kraaide ernaar dat hij niet kwam', + 'geen haan kraaide er naar dat hij niet kwam', + 'er is geen haan die daar naar kraait', ] + ) + +def select(mweutts, utt=None): + if utt is None: + result = mweutts + else: + result = (mweutts[0], [mweutts[1][utt]]) + return result + +def getparses(utterances): + uttparses = [] + for utterance in utterances: + uttparse = parse(utterance) + uttparses.append(uttparse) + return uttparses + +def trysomemwes(): + mwe, utterances = select(geenhaankraaien) + mwequeries = generatequeries(mwe) + labeledmwequeries = (('MWEQ', mwequeries[0]), ('NMQ', mwequeries[1]), ('MLQ', mwequeries[2])) + uttparses = getparses(utterances) + for utterance, uttparse in zip(utterances, uttparses): + print(f'{utterance}:') + expandeduttparse = expandfull(uttparse) + if debug: + etree.dump(expandeduttparse) + for label, mwequery in labeledmwequeries: + results = expandeduttparse.xpath(mwequery) + if debug: + print('Found hits:') + for result in results: + etree.dump(result) + print(f'{label}: {len(results)}') + + + + +if __name__ == '__main__': + trysomemwes() \ No newline at end of file From 74e51f41e517a26f92d0744ff0e3d8e6cef04c2c Mon Sep 17 00:00:00 2001 From: Ben Bonfil Date: Wed, 20 Dec 2023 15:08:00 +0100 Subject: [PATCH 2/3] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a63644..c5382dc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ +# MWE Query +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10410636.svg)](https://doi.org/10.5281/zenodo.10410636) +[![PyPI version](https://badge.fury.io/py/mwe-query.svg)](https://badge.fury.io/py/mwe-query) [![Actions Status](https://github.com/UUDigitalHumanitiesLab/mwe-query/workflows/Tests/badge.svg)](https://github.com/UUDigitalHumanitiesLab/mwe-query/actions) -# MWE Query ## Run Locally From 8d89be86f1687d37cbc5af76a2427132696caaec Mon Sep 17 00:00:00 2001 From: Ben Date: Wed, 20 Dec 2023 15:45:41 +0100 Subject: [PATCH 3/3] added CITATION.cff --- CITATION.cff | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..41e9526 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,29 @@ +# This CITATION.cff file was generated with cffinit. +# Visit https://bit.ly/cffinit to generate yours today! + +cff-version: 1.2.0 +title: MWE Query +message: >- + If you use this software, please cite it using the + metadata from this file. +type: software +authors: + - given-names: Jan + family-names: Odijk + affiliation: Utrecht University + - given-names: Martin + family-names: Kroon + affiliation: Utrecht University + orcid: 'https://orcid.org/0000-0003-3059-6872' + - name: >- + Research Software Lab, Centre for Digital Humanities, + Utrecht University + website: >- + https://cdh.uu.nl/centre-for-digital-humanities/research-software-lab/ + city: Utrecht + country: NL +identifiers: + - type: doi + value: 10.5281/zenodo.10410636 +repository-code: 'https://github.com/UUDigitalHumanitieslab/mwe-query' +license: BSD-3-Clause