From ac741a7498604c34c24af1f64027263c1b43a125 Mon Sep 17 00:00:00 2001 From: nleanba <25827850+nleanba@users.noreply.github.com> Date: Thu, 26 Sep 2024 16:03:29 +0200 Subject: [PATCH] Improved Normalization of Authority Names see #29 --- src/gg2rdf.ts | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/src/gg2rdf.ts b/src/gg2rdf.ts index a64ec6e..216a76e 100644 --- a/src/gg2rdf.ts +++ b/src/gg2rdf.ts @@ -563,10 +563,9 @@ export function gg2rdf( authority = normalizeAuthority(authority); if (baseAuthority) { // ensures the baseAuthority is not present twice - authority = authority.replaceAll( - new RegExp(`\\(?${baseAuthority}\\)?[,:;\\s]*`, "g"), - "", - ); + authority = authority + .replaceAll(baseAuthority, "@@@") + .replaceAll(/\(?@@@\)?[,:;\s]*/g, ""); } } if (baseAuthority && authority) { @@ -659,25 +658,23 @@ export function gg2rdf( /** for dwc:scientificNameAuthorship and dwc:authority */ function normalizeAuthority(a: string): string { if (!a) return ""; - let result = normalizeSpace(a).replace( - /\s*,?\s*(\(?[0-9]{4}\)?)\s*[a-z]*\s*:?(?:\s*[0-9]*\s*[a-z-]*\s*,?)*(\)?)\s*$/, - ", $1$2", - ).replaceAll( - /\s+and\s+/g, - " & ", - ).replaceAll( - /\s+et\s+([^a])/g, - " & $1", - ).replace( - /\)\)$/, - ")", - ).replace( - /^\(\(/, - "(", - ).replace( - /^\s*[,:;]+\s*/, - "", - ); + let result = normalizeSpace(a) + .replace( + /\s*,?\s*(\(?[0-9]{4}\)?)\s*[a-z]*\s*:?(?:\s*[0-9]*\s*[a-z-]*\s*,?)*(\)?)\s*$/, + ", $1$2", + ) + .replaceAll('"', "") + .replaceAll(/(?:\p{Uppercase_Letter}\.\s+)+(\w+)/ug, "$1") + .replaceAll(/\s+and\s+/g, " & ") + .replaceAll(/\s+et\s+([^a])/g, " & $1") + .replace(/\)\)$/, ")") + .replace(/^\(\(/, "(") + .replace(/^\s*[,:;]+\s*/, "") + .replace(/\s*[,:;]+\s*$/, ""); + if (result.indexOf("&") != result.lastIndexOf("&")) { + const split = result.split("&").map((s) => s.trim()); + result = split.slice(0, -1).join(", ") + " & " + split.at(-1); + } if (result.lastIndexOf("(") > result.lastIndexOf(")")) { result += ")"; // sometimes closing brace is missing }