Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added script for updating messages and basic usage help for other message scripts #1000

Open
wants to merge 3 commits into
base: clarin-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 25 additions & 31 deletions utilities/project_helpers/scripts/check-message-translations.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,41 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

## USAGE EXAMPLE: python check-message-translations.py -lang1 cs

import sys
import argparse
import os
import codecs
import re

from check_message_lib import find_language_file_name
from check_message_lib import find_language_file_name, get_js_keys, get_xml_keys

arg_parser = argparse.ArgumentParser(description='Compare the XML and JS message keys for two languages.')
arg_parser.add_argument('-lang1', required=True, help='First language as a 2-letter code')
arg_parser.add_argument('-lang2', default='en', help='Second language as a 2-letter code (defaults to "en")')
arguments = arg_parser.parse_args()
language1 = arguments.lang1
language2 = arguments.lang2

script_directory = os.path.dirname(os.path.realpath(__file__))
os.chdir(script_directory)

language1 = sys.argv[1]
language2 = sys.argv[2] if len(sys.argv) > 2 else 'en'

dspace_script = 'dspace-l10n-check.py'
def compare_keys(file_name1, file_name2, keys_function):
    """Report, in both directions, the keys that one file has and the other lacks.

    file_name1, file_name2: names of the two message files to compare.
    keys_function: callable that takes a file name and returns that file's
        keys as a set (the two results are subtracted from each other).
    """
    print('\n\nComparing {} and {}:'.format(file_name1, file_name2))
    first_keys, second_keys = keys_function(file_name1), keys_function(file_name2)
    # Keys only in the second file are "missing" from the first, and vice versa.
    missing_in_first = second_keys - first_keys
    missing_in_second = first_keys - second_keys
    report_delta(file_name1, file_name2, missing_in_first)
    report_delta(file_name2, file_name1, missing_in_second)

def report_delta(file_name1, file_name2, keys):
    """Print the keys that are present in file_name2 but missing in file_name1.

    file_name1: name of the file in which the keys are missing.
    file_name2: name of the file in which the keys are present.
    keys: collection of the keys found only in file_name2.

    The keys are printed in sorted order: iterating a set directly gives an
    order that can change from run to run, which makes reports hard to diff.
    """
    if not keys:
        print('\n Every key in {} is also in {}.'.format(file_name2, file_name1))
        return
    print('\n Present in ' + file_name2 + ' but missing in ' + file_name1 + ':')
    for key in sorted(keys):
        print(' ' + key)

xml_file_name1 = find_language_file_name(language1, 'xml')
xml_file_name2 = find_language_file_name(language2, 'xml')
os.system('python ' + dspace_script + ' ' + xml_file_name1 + ' ' + xml_file_name2)

js_key_regexp = r'^\s*["\']([\w-]+?)["\']\s*:'
def find_js_keys(js_file_name):
js_file = codecs.open(js_file_name, 'r', 'UTF-8')
keys = set()
for line in js_file:
match = re.search(js_key_regexp, line.strip(), re.U)
if (match):
keys.add(match.group(1))
return keys
compare_keys(xml_file_name1, xml_file_name2, get_xml_keys)

js_file_name1 = find_language_file_name(language1, 'js')
js_keys1 = find_js_keys(js_file_name1)
js_file_name2 = find_language_file_name(language2, 'js')
js_keys2 = find_js_keys(js_file_name2)

print '\nPresent in ' + js_file_name2 + ' but missing in ' + js_file_name1 + ':'
for key in (js_keys2 - js_keys1):
print key

print '\nPresent in ' + js_file_name1 + ' but missing in ' + js_file_name2 + ':'
for key in (js_keys1 - js_keys2):
print key

compare_keys(js_file_name1, js_file_name2, get_js_keys)
24 changes: 16 additions & 8 deletions utilities/project_helpers/scripts/check-message-usages.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,39 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import argparse
import subprocess
import codecs
import os
import re
import xml.etree.ElementTree as xml

from check_message_lib import find_language_file_name, root_directory
from check_message_lib import find_language_file_name, ROOT_DIRECTORY

language = sys.argv[1]
arg_parser = argparse.ArgumentParser(description='Check for usage of XML and JS message keys in code.')
arg_parser.add_argument('-lang', required=True, help='Language (as a 2-letter code) of the messages file')
arguments = arg_parser.parse_args()
language = arguments.lang

script_directory = os.path.dirname(os.path.realpath(__file__))
os.chdir(script_directory)

LINE_REGEXP = r'^(.+?):(.*)$'

line_regexp = r'^(.+?):(.*)$'

def find_xml_prefixes_and_files():

prefixes = ['xmlui', 'homepage', 'input_forms', 'org.dspace', 'PiwikStatisticsTransformer', 'UFAL.firstpage']
grep_command = 'grep -R -P "[>\'\\"](' + '|'.join(prefixes) + ')\\." --include=*.java --include=*.xsl --include=*.xmap --include=*.xslt --include=input-forms.xml --exclude-dir=*/target/* *'
prefix_regexp = "[>'\"]((?:" + "|".join(prefixes) + ")\..+?)[<'\"]"

os.chdir(root_directory)
os.chdir(ROOT_DIRECTORY)
with open(os.devnull, 'w') as devnull:
output = subprocess.check_output(grep_command, shell=True, stderr=devnull)
output_lines = output.strip().split('\n')
message_prefixes = set()
for grep_line in output_lines:
line_match = re.search(line_regexp, grep_line, re.U)
line_match = re.search(LINE_REGEXP, grep_line, re.U)
(file_name, line) = line_match.groups()
match_tuples = re.findall(prefix_regexp, line, re.U)
for match_tuple in match_tuples:
Expand Down Expand Up @@ -66,12 +74,12 @@ def add_js_results(language, results):
key = message_match.group(1)
result = {'type':'js', 'match':'no', 'key':key, 'file_name':None, 'prefix':None}
grep_command = 'grep -R -P "(\\\\$|jQuery)\\.i18n\._\\([\'\\"]' + key + '[\'\\"][),]" --include=*.js --include=*.html --exclude-dir=*/target/* *'
os.chdir(root_directory)
os.chdir(ROOT_DIRECTORY)
try:
with open(os.devnull, 'w') as devnull:
output = subprocess.check_output(grep_command, shell=True, stderr=devnull)
output_lines = output.strip().split('\n')
line_match = re.search(line_regexp, output_lines[0], re.U)
line_match = re.search(LINE_REGEXP, output_lines[0], re.U)
(file_name, line) = line_match.groups()
result['match'] = 'full'
result['file_name'] = file_name
Expand Down
42 changes: 30 additions & 12 deletions utilities/project_helpers/scripts/check_message_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,54 @@
# -*- coding: utf-8 -*-

import os
import re
import codecs
import xml.etree.ElementTree as etree

script_directory = os.path.dirname(os.path.realpath(__file__))
root_directory = script_directory + '/../../..'
xml_i18n_directory = root_directory + '/dspace/modules/xmlui/src/main/webapp/i18n'
js_i18n_directory = root_directory + '/dspace-xmlui/src/main/webapp/themes/UFAL/lib/js/messages'
xml_en_joint_file_name = '/tmp/messages-en.xml'
SCRIPT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
ROOT_DIRECTORY = SCRIPT_DIRECTORY + '/../../..'
XML_I18N_DIRECTORY = ROOT_DIRECTORY + '/dspace/modules/xmlui/src/main/webapp/i18n'
JS_I18N_DIRECTORY = ROOT_DIRECTORY + '/dspace-xmlui/src/main/webapp/themes/UFAL/lib/js/messages'
XML_EN_JOINT_FILE_NAME = '/tmp/messages-en.xml'
JS_KEY_REGEXP = r'^\s*["\']([\w-]+?)["\']\s*:'

def find_language_file_name(language, kind):
if (kind == 'xml'):
if (language != 'en'):
file_name = xml_i18n_directory + '/messages_' + language + '.xml'
file_name = XML_I18N_DIRECTORY + '/messages_' + language + '.xml'
else:
file_name = xml_en_joint_file_name
file_name = XML_EN_JOINT_FILE_NAME
_create_xml_en_joint_file()
elif (kind == 'js'):
if (language != 'en'):
file_name = js_i18n_directory + '/messages_' + language + '.js'
file_name = JS_I18N_DIRECTORY + '/messages_' + language + '.js'
else:
file_name = js_i18n_directory + '/messages.js'
file_name = JS_I18N_DIRECTORY + '/messages.js'
return os.path.abspath(file_name)

def get_xml_keys(messages_file_name):
    """Return the set of 'key' attribute values of the root element's children.

    messages_file_name: path of the XML messages file to parse.
    A child without a 'key' attribute contributes None to the set.
    """
    catalogue = etree.parse(messages_file_name).getroot()
    keys = set()
    for child in catalogue:
        keys.add(child.get('key'))
    return keys

def get_js_keys(js_file_name):
    """Return the set of message keys found in a JS messages file.

    js_file_name: path of the JS file, read as UTF-8.
    A key is the first group captured by JS_KEY_REGEXP on a stripped line;
    lines that do not match are ignored.
    """
    keys = set()
    # The context manager closes the file even if reading or decoding fails;
    # previously the handle was left open until garbage collection.
    with codecs.open(js_file_name, 'r', 'UTF-8') as js_file:
        for line in js_file:
            match = re.search(JS_KEY_REGEXP, line.strip(), re.U)
            if match:
                keys.add(match.group(1))
    return keys

## Merge together all messages.xml into one temporary messages-en.xml.
## Avoids xml parsing to prevent namespace complications.
def _create_xml_en_joint_file():
en_file_names = set()
for (dpath, dnames, fnames) in os.walk(root_directory):
for (dpath, dnames, fnames) in os.walk(ROOT_DIRECTORY):
for fname in [os.path.join(dpath, fname) for fname in fnames]:
if ('/target/' not in fname and fname.endswith('/messages.xml')):
en_file_names.add(os.path.abspath(fname))
print 'Constructing temporary /tmp/messages_en.xml from all messages.xml:\n ' + '\n '.join(en_file_names) + '\n'
en_joint_file = codecs.open(xml_en_joint_file_name, 'w', 'UTF-8')
print('\nConstructing temporary joint xml ' + XML_EN_JOINT_FILE_NAME + ' from all English messages.xml:\n ' + '\n '.join(en_file_names))
en_joint_file = codecs.open(XML_EN_JOINT_FILE_NAME, 'w', 'UTF-8')
for (index, en_file_name) in enumerate(en_file_names):
en_file = codecs.open(en_file_name, 'r', 'UTF-8')
if (index == 0):
Expand All @@ -54,3 +70,5 @@ def _create_xml_en_joint_file():
en_file.close()
en_joint_file.write('</catalogue>\n')
en_joint_file.close()


Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import argparse
import os
import lxml.etree as lxml
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll be a bit nitpicky here, but do we need lxml?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aha, probably not. I'll try without.


from check_message_lib import find_language_file_name, get_xml_keys

arg_parser = argparse.ArgumentParser(description="Add English XML messages missing in the language's messages, marked with @TODO=TRANSLATE.")
arg_parser.add_argument('-lang', required=True, help='Language (as a 2-letter code) of the messages file')
arguments = arg_parser.parse_args()
language = arguments.lang

script_directory = os.path.dirname(os.path.realpath(__file__))
os.chdir(script_directory)

english_file_name = find_language_file_name('en', 'xml')
english_keys = get_xml_keys(english_file_name)
other_file_name = find_language_file_name(language, 'xml')
other_keys = get_xml_keys(other_file_name)

if (other_keys == english_keys):
print('\nThe sets of message keys in {} and {} are already the same.'.format(english_file_name, other_file_name))
else:
current_map = {}
parser = lxml.XMLParser(remove_blank_text=True)
other_tree = lxml.parse(other_file_name, parser)
other_root = other_tree.getroot()
for message in other_root:
if message.tag is lxml.Comment:
other_root.remove(message)
else:
key = message.get('key')
current_map[key] = message
other_root.remove(message)
english_tree = lxml.parse(english_file_name)
english_root = english_tree.getroot()
for message in english_root:
if (message.tag != lxml.Comment):
for element in message.xpath('descendant-or-self::*'):
element.tag = element.tag[element.tag.index('}')+1:]
Comment on lines +41 to +42
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do these lines do? It seems that you are looking for `}` in the tag name?

Moreover, running the script fails:

sources/clarin-dspace$ (gh-merge/1000)> python utilities/project_helpers/scripts/update-message-xml-translations.py sl

Constructing temporary joint xml /tmp/messages-en.xml from all English messages.xml:
  /mnt/c/Users/ko_ok/sources/clarin-dspace/dspace-xmlui/src/main/webapp/i18n/messages.xml
  /mnt/c/Users/ko_ok/sources/clarin-dspace/dspace-xmlui/src/main/resources/aspects/SwordClient/i18n/messages.xml
  /mnt/c/Users/ko_ok/sources/clarin-dspace/dspace-xmlui/src/main/resources/aspects/Discovery/i18n/messages.xml
  /mnt/c/Users/ko_ok/sources/clarin-dspace/dspace-xmlui/src/main/resources/aspects/XMLWorkflow/i18n/messages.xml
Traceback (most recent call last):
  File "utilities/project_helpers/scripts/update-message-xml-translations.py", line 42, in <module>
    element.tag = element.tag[element.tag.index('}')+1:]
ValueError: substring not found

But I must say that `python utilities/project_helpers/scripts/update-message-xml-translations.py sl` finished successfully on one of four runs, which is even more confusing.

Copy link
Collaborator Author

@cyplas cyplas Dec 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this is hacky, and, as your test runs indicate, not robust. Curly braces are used to wrap a namespace in tag names (e.g., "{http://apache.org/cocoon/i18n/2.1}message"). In constructing the temporary joint English messages.xml (which is in check_message_lib.py, already from before), I had to deal with the fact that for some reason, some of the English messages.xml files have a default namespace (xmlns attribute) and others don't. But it looks like the order in which I construct the joint xml isn't deterministic (oops, set instead of list!), and that the namespace in the joint xml is not guaranteed. If and when this joint xml doesn't have a default namespace, your ValueError must be triggered.

Ok, I can fix the joint xml construction, and maybe it's best to make sure it does not have a namespace (since messages_cs.xml and messages_sl.xml don't). And then these lines can and must be removed.

key = message.get('key')
if (key in other_keys):
other_root.append(current_map[key])
else:
message.tail = None
message.set('TODO', 'translate')
other_root.append(message)
other_tree.write(other_file_name, encoding='UTF-8', pretty_print=True)
print('\n{} has been updated to contain all and only the keys of {}.'.format(other_file_name, english_file_name))
print('')