# upgrade_translations.py

#!/usr/bin/python2
# -*- coding: utf-8 -*-

# Copyright (C) 2019 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Upgrades GTFS from Google translations extension [1] to GTFS-Translations [2].

[1] http://developers.google.com/transit/gtfs/reference/gtfs-extensions#translationstxt
[2] http://bit.ly/gtfs-translations

Usage.

Upgrade translations of a feed unpacked into the `my-feed` directory and store
the result in `my-feed_upgraded`:

  $ upgrade_translations.py my-feed

Specify output directory name explicitly:

  $ upgrade_translations.py my-feed-old my-feed-new

Sample feed.

  feed_info.txt:
  feed_publisher_name,feed_publisher_url,feed_lang
  Narnia,http://en.wikipedia.org/wiki/Narnia,en

  stops.txt:
  stop_id,stop_name,stop_lat,stop_lon
  stop1,Palace,10,11

  trips.txt:
  route_id,service_id,trip_id,trip_headsign
  sledge,service1,trip1,To Palace

Translations in Google extension format.

  translations.txt:
  trans_id,lang,translation
  http://en.wikipedia.org/wiki/Narnia,en,http://en.wikipedia.org/wiki/Narnia
  http://en.wikipedia.org/wiki/Narnia,es,http://es.wikipedia.org/wiki/Narnia
  Palace,en,Palace
  Palace,es,Palacio
  To Palace,en,To Palace
  To Palace,es,Palacio

Translations in GTFS-Translations format.

  translations.txt:
  table_name,field_name,language,translation,record_id,record_sub_id,field_value
  feed_info,feed_publisher_url,es,http://es.wikipedia.org/wiki/Narnia,,,
  stops,stop_name,es,Palacio,stop1,,
  trips,trip_headsign,es,Palacio,,,To Palace
"""

from __future__ import print_function

import csv
import os
import os.path
import shutil
import sys

# GTFS-Translations defines record_id and record_sub_id used for referencing a
# row in a GTFS table that requires translation.
#
# Maps a table name (file name without ".txt") to the pair of field names
# whose values are used as (record_id, record_sub_id).  None means that the
# corresponding identifier is not used for the table.  Tables missing from
# this map are handled by RecordIdHelper, which falls back to the first field
# whose name ends with "_id".
RECORD_ID_MAP = {
    'agency': ('agency_id', None),
    'stops': ('stop_id', None),
    'routes': ('route_id', None),
    'trips': ('trip_id', None),
    'stop_times': ('trip_id', 'stop_sequence'),
    'feed_info': (None, None),
    'calendar': ('service_id', None),
    'calendar_dates': ('service_id', 'date'),
    'fare_attributes': ('fare_id', None),
    'fare_rules': ('fare_id', 'route_id'),
    'shapes': ('shape_id', 'shape_pt_sequence'),
    'frequencies': ('trip_id', 'start_time'),
    'transfers': ('from_stop_id', 'to_stop_id'),
    'pathways': ('pathway_id', None),
    'levels': ('level_id', None),
}

# File translations.txt in GTFS-Translations has the following fields.
# The list is passed as `fieldnames` to the csv.DictWriter that produces the
# new translations.txt, so it also fixes the column order of that file.
NEW_TRANSLATIONS_FIELDS = [
    'table_name',
    'field_name',
    'language',
    'translation',
    'record_id',
    'record_sub_id',
    'field_value',
]

# Fields whose names end with the following suffixes are translated according
# to Google translations extension.
# is_translatable_field() matches them with str.endswith, so an entry without
# a leading underscore also matches a field with exactly that name.
TRANSLATABLE_FIELD_NAME_SUFFIXES = [
    '_name',
    '_desc',
    '_headsign',
    '_url',
    '_text',
    '_abbreviation',
    # Handle pathway fields "signposted_as", "reversed_signposted_as"
    # and "instructions".
    'signposted_as',
    'instructions',
]


class RecordIdHelper(object):
    """Resolves record_id and record_sub_id values for rows of one GTFS table.

    The names of the identifying fields come from RECORD_ID_MAP.  For a table
    that is not listed there, the first field whose name ends with "_id" is
    used as record_id and no record_sub_id is available.
    """
    def __init__(self, table_name, field_names):
        known_ids = RECORD_ID_MAP.get(table_name)
        if known_ids is not None:
            self.id_and_sub_id = known_ids
        else:
            # Unknown table: guess record_id from the header; the guess is
            # None when no field name ends with "_id".
            self.id_and_sub_id = (self._find_first_id(field_names), None)

    def get_record_id(self, row):
        """Returns the record_id value of the row, None if the table has none.
        """
        id_field = self.id_and_sub_id[0]
        return row.get(id_field) if id_field else None

    def get_record_sub_id(self, row):
        """Returns the record_sub_id value of the row, None if the table has
        none.
        """
        sub_id_field = self.id_and_sub_id[1]
        return row.get(sub_id_field) if sub_id_field else None

    def describe_ids(self):
        """Returns a printable description of the identifying field names.
        """
        id_field, sub_id_field = self.id_and_sub_id
        return 'record_id = "%s", record_sub_id = "%s"' % (
            id_field or '', sub_id_field or '')

    @staticmethod
    def _find_first_id(field_names):
        """Returns the first field name ending with "_id", None if absent.
        """
        return next(
            (name for name in field_names if name.endswith('_id')), None)


def read_first_available_value(filename, field_name):
    """Returns the first non-empty value of a column of the CSV table.

    Returns None when the file does not exist or when no row holds a
    non-empty value for field_name.
    """
    if os.path.exists(filename):
        with open(filename, 'rb') as table:
            candidates = (
                record.get(field_name) for record in csv.DictReader(table))
            # The generators are consumed here, while the file is still open.
            return next((value for value in candidates if value), None)
    return None


def is_translatable_field(field):
    """Tells whether the field name ends with one of the suffixes listed in
    TRANSLATABLE_FIELD_NAME_SUFFIXES.
    """
    # str.endswith accepts a tuple of alternatives.
    return field.endswith(tuple(TRANSLATABLE_FIELD_NAME_SUFFIXES))


def any_translatable_field(fields):
    """Tells whether at least one of the given field names is translatable.
    """
    for name in fields:
        if is_translatable_field(name):
            return True
    return False


class OldTranslations(object):
    """Loads the translations of a feed in the Google extension format.

    The constructor fills the following attributes:
      feed_language: language read from feed_info.txt or agency.txt.
      translations_map: dict trans_id -> {lang: translation}.
      context_dependent_names: set of feed-language texts that belong to
        more than one trans_id.
    """
    def __init__(self, src_dir):
        self.src_dir = src_dir
        self._find_feed_language()
        self._read_translations()
        self._find_context_dependent_names()

    def _find_feed_language(self):
        """Determines the feed language from feed_info.txt or agency.txt.

        feed_info.txt takes precedence; raises Exception when neither file
        provides a language.
        """
        sources = (
            ('feed_info.txt', 'feed_lang'),
            ('agency.txt', 'agency_lang'),
        )
        language = None
        for table_filename, language_field in sources:
            language = read_first_available_value(
                os.path.join(self.src_dir, table_filename), language_field)
            if language:
                break
        self.feed_language = language
        if not language:
            raise Exception(
                'Cannot find feed language in feed_info.txt and agency.txt')
        print('\tfeed language: %s' % self.feed_language)

    def _read_translations(self):
        """Loads the old translations.txt into translations_map.
        """
        print('Reading original translations')
        by_trans_id = {}
        self.translations_map = by_trans_id
        row_count = 0
        path = os.path.join(self.src_dir, 'translations.txt')
        with open(path, 'rb') as csvfile:
            for row in csv.DictReader(csvfile):
                translation = row['translation']
                per_language = by_trans_id.setdefault(row['trans_id'], {})
                per_language[row['lang']] = translation
                row_count += 1
        print('\ttotal original translations: %s' % row_count)

    def _find_context_dependent_names(self):
        """Collects feed-language texts that are shared by several trans_ids.

        Such a text may be translated differently depending on where it is
        used.

        Example.
          Feed language is es (Spanish); "Palacio" becomes either "Palace" or
          "To Palace" in English, so "Palacio" is context-dependent.

          trans_id,lang,translation
          stop-name-1,es,Palacio
          stop-name-1,en,Palace
          headsign-1,es,Palacio
          headsign-1,en,To Palace

        Raises Exception when a trans_id has no translation in the feed
        language.
        """
        originals = []
        for trans_id, by_language in self.translations_map.items():
            if self.feed_language not in by_language:
                raise Exception(
                    'No translation in feed language for %s, available: %s' %
                    (trans_id, by_language))
            originals.append(by_language[self.feed_language])

        seen_once = set()
        repeated = set()
        for name in originals:
            if name in seen_once:
                repeated.add(name)
            else:
                seen_once.add(name)

        self.context_dependent_names = repeated
        print('Total context-dependent translations: %d' %
              len(self.context_dependent_names))


class TranslationsConverter(object):
    """Converts translations from the old to the new format.

    The old translations of the feed in src_dir are loaded by the
    constructor; convert_translations() then writes the upgraded feed.
    """
    def __init__(self, src_dir):
        """Loads the old translations of the feed stored in src_dir.
        """
        self.src_dir = src_dir
        self.old_translations = OldTranslations(src_dir)

    def convert_translations(self, dest_dir):
        """
        Converts translations to the new format and stores at dest_dir.

        Writes the new translations.txt and one output table for every .txt
        file of the source directory.  dest_dir is created when missing.

        Args:
          dest_dir: directory that receives the upgraded feed.  It must not
            be the source directory.

        Raises:
          ValueError: if dest_dir refers to the source directory.
        """
        # Every output table is opened for writing while the input table of
        # the same name is still being read, and translations.txt is
        # overwritten, so converting in place would destroy the source feed.
        source = os.path.normcase(os.path.realpath(self.src_dir))
        destination = os.path.normcase(os.path.realpath(dest_dir))
        if source == destination:
            raise ValueError(
                'Destination directory must differ from source directory: %s'
                % dest_dir)

        if not os.path.isdir(dest_dir):
            os.makedirs(dest_dir)
        total_translation_rows = 0
        with open(os.path.join(dest_dir, 'translations.txt'),
                  'w+b') as out_file:
            writer = csv.DictWriter(
                out_file, fieldnames=NEW_TRANSLATIONS_FIELDS)
            writer.writeheader()
            # Sorted order keeps the output independent of the listing order.
            for filename in sorted(os.listdir(self.src_dir)):
                if not (filename.endswith('.txt') and
                        os.path.isfile(os.path.join(self.src_dir, filename))):
                    print('Skipping %s' % filename)
                    continue
                table_name = filename[:-len('.txt')]
                # The old translations.txt is replaced by the file written
                # through `writer`, so it is not converted as a table.
                if table_name == 'translations':
                    continue
                total_translation_rows += self._translate_table(
                    dest_dir, table_name, writer)
        print('Total translation rows: %s' % total_translation_rows)

    def _translate_table(self, dest_dir, table_name, translations_writer):
        """
        Converts translations to the new format for a single table.

        A table without translatable columns is copied as is.  Otherwise the
        table is rewritten row by row through TableTranslator, which also
        emits rows for the new translations.txt via translations_writer.

        Args:
          dest_dir: directory that receives the output table.
          table_name: table name, i.e. file name without ".txt".
          translations_writer: writer of the new translations.txt.

        Returns:
          The number of translation rows written for this table.

        Raises:
          Exception: if the table file does not exist in the source directory.
        """
        in_filename = os.path.join(self.src_dir, '%s.txt' % table_name)
        if not os.path.exists(in_filename):
            raise Exception('No %s' % table_name)

        out_filename = os.path.join(dest_dir, '%s.txt' % table_name)
        with open(in_filename, 'rb') as in_file:
            reader = csv.DictReader(in_file)
            # An empty file has no header (fieldnames is None).
            if not reader.fieldnames or not any_translatable_field(
                    reader.fieldnames):
                print('Copying %s with no translatable columns' % table_name)
                shutil.copy(in_filename, out_filename)
                return 0
            table_translator = TableTranslator(
                table_name, reader.fieldnames, self.old_translations,
                translations_writer)
            with open(out_filename, 'w+b') as out_file:
                writer = csv.DictWriter(out_file, fieldnames=reader.fieldnames)
                writer.writeheader()
                for row in reader:
                    writer.writerow(table_translator.translate_row(row))

            # Translations keyed by field_value are buffered per table and
            # written only after all rows have been seen.
            table_translator.write_for_field_values()
            print('\ttranslation rows: %s' %
                  table_translator.total_translation_rows)
            return table_translator.total_translation_rows


class TableTranslator(object):
    """Translates a given GTFS table.

    Rows are processed one at a time with translate_row().  Translations
    keyed by field_value are buffered and have to be flushed with
    write_for_field_values() once the whole table has been processed.
    """
    def __init__(self, table_name, field_names, old_translations,
                 translations_writer):
        """
        Args:
          table_name: GTFS table name, i.e. file name without ".txt".
          field_names: header of the table, used to pick the record_id field.
          old_translations: OldTranslations holding the original translations.
          translations_writer: object whose writerow() receives one dict per
            row of the new translations.txt.
        """
        self.table_name = table_name
        self.old_translations = old_translations
        self.translations_writer = translations_writer
        self.record_id_helper = RecordIdHelper(table_name, field_names)
        self.total_translation_rows = 0
        # stop_times.txt and trips.txt usually have a lot of repeated
        # headsigns, so it is better to use field_value than record_id
        # and record_sub_id.  However, we will fallback to
        # record_id+sub_id if the translation is context-dependent,
        # e.g., the same trip_headsign is translated differently for
        # different trips.
        self.table_uses_record_id = table_name not in ('stop_times', 'trips')
        # (field_name, language, field_value) -> translation.  A dict is used
        # so that a repeated value yields a single translation row.
        self.translations_for_values = {}

        print('Translating %s by %s' % (
            table_name,
            self.record_id_helper.describe_ids()
            if self.table_uses_record_id
            else 'field_value'))

    def translate_row(self, row):
        """Translates one row of the table.

        Every translatable field whose value is a known trans_id is replaced
        by the text in the feed language.  Translations to other languages
        are either written right away (keyed by record_id and record_sub_id)
        or buffered until write_for_field_values() (keyed by field_value).

        Args:
          row: dict field name -> value; it is not modified.

        Returns:
          A copy of the row with texts in the feed language.
        """
        table_name = self.table_name
        feed_language = self.old_translations.feed_language
        translations_map = self.old_translations.translations_map
        context_dependent_names = self.old_translations.context_dependent_names
        # Work on a copy so that the caller's row is left untouched.
        out_row = dict(row)
        # The identifiers are the same for all fields of the row.
        record_id = self.record_id_helper.get_record_id(row)
        record_sub_id = self.record_id_helper.get_record_sub_id(row)
        for field_name, field_value in row.items():
            if not is_translatable_field(field_name):
                continue
            # In the old format the value stored in the table is a trans_id.
            field_translations = translations_map.get(field_value)
            if not field_translations:
                continue
            value_in_feed_lang = field_translations[feed_language]
            out_row[field_name] = value_in_feed_lang
            # If translation depends on the context, then always use record_id.
            use_record_id = (
                self.table_uses_record_id or
                value_in_feed_lang in context_dependent_names)
            for language, translation in field_translations.items():
                # The feed-language text already lives in the table itself.
                if language == feed_language:
                    continue
                if use_record_id:
                    self._write_translation_row({
                        'table_name': table_name,
                        'field_name': field_name,
                        'language': language,
                        'translation': translation,
                        'record_id': record_id,
                        'record_sub_id': record_sub_id,
                    })
                else:
                    self.translations_for_values[(
                        field_name,
                        language,
                        value_in_feed_lang)] = translation
        return out_row

    def write_for_field_values(self):
        """Writes the buffered translations that are keyed by field_value.
        """
        for ((field_name, language, field_value),
             translation) in self.translations_for_values.items():
            self._write_translation_row({
                'table_name': self.table_name,
                'field_name': field_name,
                'language': language,
                'translation': translation,
                'field_value': field_value,
            })

    def _write_translation_row(self, row):
        """Writes one row of the new translations.txt and counts it.
        """
        self.translations_writer.writerow(row)
        self.total_translation_rows += 1


def main():
    """Command-line entry point.

    Takes the source GTFS directory as the first argument and, optionally,
    the destination directory as the second one.  Without the second
    argument the output goes to "<source>_upgraded".  Prints the usage to
    stderr and exits with status 1 when no argument is given.
    """
    arguments = sys.argv[1:]
    if not arguments:
        print('usage: upgrade_translations.py [SRC GTFS DIR] [DEST GTFS DIR]',
              file=sys.stderr)
        sys.exit(1)

    # normpath drops a trailing slash, which keeps the default name tidy.
    src_dir = os.path.normpath(arguments[0])
    dest_dir = arguments[1] if len(arguments) > 1 else '%s_upgraded' % src_dir

    print('Upgrading translations')
    print('\tsource directory: %s' % src_dir)
    print('\tdestination directory: %s' % dest_dir)

    converter = TranslationsConverter(src_dir)
    converter.convert_translations(dest_dir)
    print('Done!')


if __name__ == '__main__':
    main()