diff --git a/demo.py b/demo.py index a383823..2984946 100644 --- a/demo.py +++ b/demo.py @@ -10,7 +10,7 @@ from tqdm import tqdm from dh_segment.io import PAGE -from dh_segment.network import LoadedModel +from dh_segment.inference import LoadedModel from dh_segment.post_processing import boxes_detection, binarization # To output results in PAGE XML format (http://www.primaresearch.org/schema/PAGE/gts/pagecontent/2013-07-15/) @@ -89,14 +89,17 @@ def format_quad_to_string(quad): cv2.polylines(original_img, [pred_page_coords[:, None, :]], True, (0, 0, 255), thickness=5) # Write corners points into a .txt file txt_coordinates += '{},{}\n'.format(filename, format_quad_to_string(pred_page_coords)) + + # Create page region and XML file + page_border = PAGE.Border(coords=PAGE.Point.cv2_to_point_list(pred_page_coords[:, None, :])) else: print('No box found in {}'.format(filename)) + page_border = PAGE.Border() + basename = os.path.basename(filename).split('.')[0] imsave(os.path.join(output_dir, '{}_boxes.jpg'.format(basename)), original_img) - # Create page region and XML file - page_border = PAGE.Border(coords=PAGE.Point.cv2_to_point_list(pred_page_coords[:, None, :])) - page_xml = PAGE.Page(filename, image_width=original_shape[1], image_height=original_shape[0], + page_xml = PAGE.Page(image_filename=filename, image_width=original_shape[1], image_height=original_shape[0], page_border=page_border) xml_filename = os.path.join(output_pagexml_dir, '{}.xml'.format(basename)) page_xml.write_to_file(xml_filename, creator_name='PageExtractor') diff --git a/demo/interactive_demo.ipynb b/demo/interactive_demo.ipynb new file mode 100644 index 0000000..a7c6df5 --- /dev/null +++ b/demo/interactive_demo.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Interactive demo to load a trained model for page extraction and apply it to a randomly selected file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1. Get the annotated sample dataset, which already contains the folders images and labels. Unzip it into `demo/pages_sample`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! wget https://github.com/dhlab-epfl/dhSegment/releases/download/untagged-b55f9aa4fff5efd4b1b8/pages_sample.zip\n", + "! unzip pages_sample.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Download the provided model (download and unzip it in `demo/model`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! wget https://github.com/dhlab-epfl/dhSegment/releases/download/v0.2/model.zip\n", + "! unzip model.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. 
Run the code step by step" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import cv2\n", + "from glob import glob\n", + "import numpy as np\n", + "import random\n", + "import tensorflow as tf\n", + "from imageio import imread, imsave" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dh_segment.io import PAGE\n", + "from dh_segment.inference import LoadedModel\n", + "from dh_segment.post_processing import boxes_detection, binarization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def page_make_binary_mask(probs: np.ndarray, threshold: float=-1) -> np.ndarray:\n", + " \"\"\"\n", + " Computes the binary mask of the detected Page from the probabilities outputed by network\n", + " :param probs: array with values in range [0, 1]\n", + " :param threshold: threshold between [0 and 1], if negative Otsu's adaptive threshold will be used\n", + " :return: binary mask\n", + " \"\"\"\n", + "\n", + " mask = binarization.thresholding(probs, threshold)\n", + " mask = binarization.cleaning_binary(mask, kernel_size=5)\n", + " return mask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define input and output directories / files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_dir = 'page_model/export'\n", + "if not os.path.exists(model_dir):\n", + " model_dir = 'model/'\n", + "assert(os.path.exists(model_dir))\n", + "\n", + "input_files = glob(os.path.join('pages_sample', 'images/*'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_dir = './processed_images'\n", + "os.makedirs(output_dir, exist_ok=True)\n", + "# PAGE XML format output\n", + "output_pagexml_dir = os.path.join(output_dir, 'page_xml')\n", + "os.makedirs(output_pagexml_dir, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Start a tensorflow session" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "session = tf.InteractiveSession()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Select a random image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file_to_process = random.sample(input_files, 1)[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m = LoadedModel(model_dir, predict_mode='filename')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Predict each pixel's label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# For each image, predict each pixel's label\n", + "prediction_outputs = m.predict(file_to_process)\n", + "probs = prediction_outputs['probs'][0]\n", + "original_shape = prediction_outputs['original_shape']\n", + 
"\n", + "probs = probs[:, :, 1] # Take only class '1' (class 0 is the background, class 1 is the page)\n", + "probs = probs / np.max(probs) # Normalize to be in [0, 1]\n", + "\n", + "# Binarize the predictions\n", + "page_bin = page_make_binary_mask(probs)\n", + "\n", + "# Upscale to have full resolution image (cv2 uses (w,h) and not (h,w) for giving shapes)\n", + "bin_upscaled = cv2.resize(page_bin.astype(np.uint8, copy=False),\n", + " tuple(original_shape[::-1]), interpolation=cv2.INTER_NEAREST)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Show the probability map and binarized mask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,10))\n", + "plt.subplot(1,2,1)\n", + "plt.imshow(probs, cmap='gray')\n", + "plt.axis('off')\n", + "plt.title('Probability map')\n", + "plt.subplot(1,2,2)\n", + "plt.imshow(page_bin, cmap='gray')\n", + "plt.axis('off')\n", + "plt.title('Binary mask')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find quadrilateral enclosing the page" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pred_page_coords = boxes_detection.find_boxes(bin_upscaled.astype(np.uint8, copy=False),\n", + " mode='min_rectangle', n_max_boxes=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Draw page box on original image and export it. Add also box coordinates to the txt file\n", + "original_img = imread(file_to_process, pilmode='RGB')\n", + "if pred_page_coords is not None:\n", + " cv2.polylines(original_img, [pred_page_coords[:, None, :]], True, (0, 0, 255), thickness=5)\n", + "else:\n", + " print('No box found in {}'.format(filename))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,10))\n", + "plt.imshow(original_img)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Export image and create page region and XML file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "basename = os.path.basename(file_to_process).split('.')[0]\n", + "imsave(os.path.join(output_dir, '{}_boxes.jpg'.format(basename)), original_img)\n", + "\n", + "page_border = PAGE.Border(coords=PAGE.Point.cv2_to_point_list(pred_page_coords[:, None, :]))\n", + "page_xml = PAGE.Page(image_filename=file_to_process, image_width=original_shape[1], image_height=original_shape[0], page_border=page_border)\n", + "xml_filename = os.path.join(output_pagexml_dir, '{}.xml'.format(basename))\n", + "page_xml.write_to_file(xml_filename, creator_name='PageExtractor')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. 
Have a look at the results in ``demo/processed_images``" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:dhsegment]", + "language": "python", + "name": "conda-env-dhsegment-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/dh_segment/estimator_fn.py b/dh_segment/estimator_fn.py index d25303b..37d92bc 100644 --- a/dh_segment/estimator_fn.py +++ b/dh_segment/estimator_fn.py @@ -2,7 +2,6 @@ from .utils import PredictionType, ModelParams, TrainingParams, \ class_to_label_image, multiclass_to_label_image import numpy as np -from .network.model import inference_resnet_v1_50, inference_vgg16, inference_u_net def model_fn(mode, features, labels, params): @@ -18,45 +17,23 @@ def model_fn(mode, features, labels, params): input_images = tf.pad(input_images, [[0, 0], [margin, margin], [margin, margin], [0, 0]], mode='SYMMETRIC', name='mirror_padding') - if model_params.pretrained_model_name == 'vgg16': - network_output = inference_vgg16(input_images, - model_params, - model_params.n_classes, - use_batch_norm=model_params.batch_norm, - weight_decay=model_params.weight_decay, - is_training=(mode == tf.estimator.ModeKeys.TRAIN) - ) - key_restore_model = 'vgg_16' + encoder_class = model_params.get_encoder() + encoder = encoder_class(**model_params.encoder_network_params) + decoder_class = model_params.get_decoder() + decoder = decoder_class(**model_params.decoder_network_params) - elif model_params.pretrained_model_name == 'resnet50': - network_output = inference_resnet_v1_50(input_images, - model_params, - model_params.n_classes, - use_batch_norm=model_params.batch_norm, - weight_decay=model_params.weight_decay, - is_training=(mode == tf.estimator.ModeKeys.TRAIN) - ) - key_restore_model = 'resnet_v1_50' - elif model_params.pretrained_model_name == 'unet': - network_output = inference_u_net(input_images, - model_params, - model_params.n_classes, - use_batch_norm=model_params.batch_norm, - weight_decay=model_params.weight_decay, - is_training=(mode == tf.estimator.ModeKeys.TRAIN) - ) - key_restore_model = None - else: - raise NotImplementedError + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + feature_maps = encoder(input_images, is_training=is_training) + network_output = decoder(feature_maps, num_classes=model_params.n_classes, is_training=is_training) if mode == tf.estimator.ModeKeys.TRAIN: - if key_restore_model is not None: + pretrained_file, pretrained_vars = encoder.pretrained_information() + if pretrained_file: # Pretrained weights as initialization - pretrained_restorer = tf.train.Saver(var_list=[v for v in tf.global_variables() - if key_restore_model in v.name]) + pretrained_restorer = tf.train.Saver(var_list=pretrained_vars) def init_fn(scaffold, session): - pretrained_restorer.restore(session, model_params.pretrained_model_file) + pretrained_restorer.restore(session, pretrained_file) else: init_fn = None else: @@ -92,8 +69,10 @@ def init_fn(scaffold, session): if prediction_type == PredictionType.CLASSIFICATION: onehot_labels = tf.one_hot(indices=labels, depth=model_params.n_classes) with tf.name_scope("loss"): - per_pixel_loss = tf.nn.softmax_cross_entropy_with_logits(logits=network_output, - labels=onehot_labels, name='per_pixel_loss') + #per_pixel_loss = 
tf.nn.softmax_cross_entropy_with_logits(logits=network_output, + # labels=onehot_labels, name='per_pixel_loss') + per_pixel_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=network_output, + labels=onehot_labels, name='per_pixel_loss') if training_params.focal_loss_gamma > 0.0: # Probability per pixel of getting the correct label probs_correct_label = tf.reduce_max(tf.multiply(prediction_probs, onehot_labels)) @@ -207,14 +186,20 @@ def _fn(_in): # ---------- if mode == tf.estimator.ModeKeys.EVAL: if prediction_type == PredictionType.CLASSIFICATION: - metrics = {'eval/accuracy': tf.metrics.accuracy(labels, predictions=prediction_labels)} + metrics = { + 'eval/accuracy': tf.metrics.accuracy(labels, predictions=prediction_labels), + 'eval/mIOU': tf.metrics.mean_iou(labels, prediction_labels, num_classes=model_params.n_classes,) + # weights=tf.cast(training_params.weights_evaluation_miou, tf.float32)) + } elif prediction_type == PredictionType.REGRESSION: metrics = {'eval/accuracy': tf.metrics.mean_squared_error(labels, predictions=prediction_labels)} elif prediction_type == PredictionType.MULTILABEL: metrics = {'eval/MSE': tf.metrics.mean_squared_error(tf.cast(labels, tf.float32), predictions=prediction_probs), 'eval/accuracy': tf.metrics.accuracy(tf.cast(labels, tf.bool), - predictions=tf.cast(prediction_labels, tf.bool)) + predictions=tf.cast(prediction_labels, tf.bool)), + 'eval/mIOU': tf.metrics.mean_iou(labels, prediction_labels, num_classes=model_params.n_classes) + # weights=training_params.weights_evaluation_miou) } else: metrics = None diff --git a/dh_segment/inference/loader.py b/dh_segment/inference/loader.py index be64bc7..4949673 100644 --- a/dh_segment/inference/loader.py +++ b/dh_segment/inference/loader.py @@ -9,6 +9,16 @@ class LoadedModel: + """ + Loads an exported dhSegment model + + :param model_base_dir: the model directory i.e. containing `saved_model.{pb|pbtxt}`. If not, it is assumed to \ + be a TF exporter directory, and the latest export directory will be automatically selected. + :param predict_mode: defines the input/output format of the prediction output (see `.predict()`) + :param num_parallel_predictions: limits the number of conccurent calls of `predict` to avoid Out-Of-Memory \ + issues if predicting on GPU + """ + def __init__(self, model_base_dir, predict_mode='filename', num_parallel_predictions=2): if os.path.exists(os.path.join(model_base_dir, 'saved_model.pbtxt')) or \ os.path.exists(os.path.join(model_base_dir, 'saved_model.pb')): @@ -52,6 +62,29 @@ def __init__(self, model_base_dir, predict_mode='filename', num_parallel_predict self.sema = Semaphore(num_parallel_predictions) def predict(self, input_tensor, prediction_key=None): + """ + Performs the prediction from the loaded model according to the prediction mode. 
\n + Prediction modes: + + +-----------------------------+-----------------------------------------------+--------------------------------------+---------------------------------------------------------------------------------------------------+ + | `prediction_mode` | `input_tensor` | Output prediction dictionnary | Comment | + +=============================+===============================================+======================================+===================================================================================================+ + | `filename` | Single filename string | `labels`, `probs`, `original_shape` | Loads the image, resizes it, and predicts | + +-----------------------------+-----------------------------------------------+--------------------------------------+---------------------------------------------------------------------------------------------------+ + | `filename_original_shape` | Single filename string | `labels`, `probs` | Loads the image, resizes it, predicts and scale the output to the original resolution of the file | + +-----------------------------+-----------------------------------------------+--------------------------------------+---------------------------------------------------------------------------------------------------+ + | `image` | Single input image [1,H,W,3] float32 (0..255) | `labels`, `probs`, `original_shape` | Resizes the image, and predicts | + +-----------------------------+-----------------------------------------------+--------------------------------------+---------------------------------------------------------------------------------------------------+ + | `image_original_shape` | Single input image [1,H,W,3] float32 (0..255) | `labels`, `probs` | Resizes the image, predicts, and scale the output to the original resolution of the input | + +-----------------------------+-----------------------------------------------+--------------------------------------+---------------------------------------------------------------------------------------------------+ + | `image_resized` | Single input image [1,H,W,3] float32 (0..255) | `labels`, `probs` | Predicts from the image input directly | + +-----------------------------+-----------------------------------------------+--------------------------------------+---------------------------------------------------------------------------------------------------+ + + :param input_tensor: a single input whose format should match the prediction mode + :param prediction_key: if not `None`, will returns the value of the corresponding key of the output dictionnary \ + instead of the full dictionnary + :return: the prediction output + """ with self.sema: if prediction_key: desired_output = self._output_dict[prediction_key] diff --git a/dh_segment/io/PAGE.py b/dh_segment/io/PAGE.py index d9d5a11..5214735 100644 --- a/dh_segment/io/PAGE.py +++ b/dh_segment/io/PAGE.py @@ -8,6 +8,7 @@ from uuid import uuid4 from shapely.geometry import Polygon from abc import ABC +import re # https://docs.python.org/3.5/library/xml.etree.elementtree.html#parsing-xml-with-namespaces _ns = {'p': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'} @@ -17,7 +18,7 @@ def _try_to_int(d: Optional[Union[str, int]])-> Optional[int]: - if isinstance(d, str): + if isinstance(d, (str, np.int32, np.int64)): return int(d) else: return d @@ -54,6 +55,9 @@ def list_from_xml(cls, etree_elem: ET.Element) -> List['Point']: if etree_elem is None: # print('warning, trying to construct list of points from None, 
defaulting to []') return [] + if etree_elem.attrib['points'] == "": + # print('warning, trying to construct list of points from empty string, defaulting to []') + return [] t = etree_elem.attrib['points'] result = [] for p in t.split(' '): @@ -99,12 +103,21 @@ def array_to_list(cls, array: np.ndarray) -> list: """ return [list(pt) for pt in array] + @classmethod + def array_to_point(cls, array: np.ndarray) -> list: + """Converts an `np.array` to a list of `Point` + + :param array: an array of coordinates. Must be of shape (N, 2) + :return: list of `Point` + """ + return cls.list_to_point(list(array)) + @classmethod def list_to_point(cls, list_coords: list) -> List['Point']: """Converts a list of coordinates to a list of `Point` :param list_coords: list of coordinates, shape (N, 2) - :return: list of Points + :return: list of `Point` """ return [cls(coord[1], coord[0]) for coord in list_coords if list_coords] @@ -160,12 +173,15 @@ class Region(BaseElement): :ivar id: identifier of the `Region` :ivar coords: coordinates of the `Region` + :ivar custom_attribute: Any custom attribute that may be linked with the region + (usually this is added in PAGEXML files, not in JSON files) """ tag = 'Region' - def __init__(self, id: str=None, coords: List[Point]=None): + def __init__(self, id: str=None, coords: List[Point]=None, custom_attribute: str=None): self.coords = coords if coords is not None else [] self.id = id + self.custom_attribute = custom_attribute if custom_attribute is not None else '' @classmethod def from_xml(cls, etree_element: ET.Element) -> dict: @@ -175,6 +191,7 @@ def from_xml(cls, etree_element: ET.Element) -> dict: :return: a dictionary with keys 'id' and 'coords' """ return {'id': etree_element.attrib.get('id'), + 'custom_attribute': etree_element.attrib.get('custom'), 'coords': Point.list_from_xml(etree_element.find('p:Coords', _ns))} def to_xml(self, name_element: str=None) -> ET.Element: @@ -185,6 +202,7 @@ def to_xml(self, name_element: str=None) -> ET.Element: """ et = ET.Element(name_element if name_element is not None else '') et.set('id', self.id if self.id is not None else '') + et.set('custom', self.custom_attribute if self.custom_attribute is not None else '') if not not self.coords: coords = ET.SubElement(et, 'Coords') coords.set('points', Point.list_point_to_string(self.coords)) @@ -209,6 +227,7 @@ def from_dict(cls, dictionary: dict) -> dict: :return: non serialized dictionary """ return {'id': dictionary.get('id'), + 'custom_attribute': dictionary.get('custom_attribute'), 'coords': Point.list_to_point(dictionary.get('coords')) } @@ -222,13 +241,14 @@ class TextLine(Region): :ivar text: `Text` class containing the transcription of the `TextLine` :ivar line_group_id: identifier of the line group the instance belongs to :ivar column_group_id: identifier of the column group the instance belongs to - + :ivar custom_attribute: Any custom attribute that may be linked with the region + (usually this is added in PAGEXML files, not in JSON files) """ tag = 'TextLine' def __init__(self, id: str = None, coords: List[Point] = None, baseline: List[Point] = None, text: Text = None, - line_group_id: str = None, column_group_id: str = None): - super().__init__(id=id if id is not None else str(uuid4()), coords=coords) + line_group_id: str = None, column_group_id: str = None, custom_attribute: str=None): + super().__init__(id=id if id is not None else str(uuid4()), coords=coords, custom_attribute=custom_attribute) self.baseline = baseline if baseline is not None else [] self.text = 
text if text is not None else Text() self.line_group_id = line_group_id if line_group_id is not None else '' @@ -321,13 +341,29 @@ class TextRegion(Region): :ivar coords: coordinates of the `TextRegion` :ivar text_equiv: the resulting text of the `Text` contained in the `TextLines` :ivar text_lines: a list of `TextLine` objects + :ivar region_type: the type of a TextRegion (can be any string). Example : header, paragraph, page-number... + :ivar custom_attribute: Any custom attribute that may be linked with the region + (usually this is added in PAGEXML files, not in JSON files) """ tag = 'TextRegion' - def __init__(self, id: str=None, coords: List[Point]=None, text_lines: List[TextLine]=None, text_equiv: str=''): - super().__init__(id=id, coords=coords) + def __init__(self, id: str=None, coords: List[Point]=None, text_lines: List[TextLine]=None, text_equiv: str='', + region_type: str=None, custom_attribute: str=None): + super().__init__(id=id, coords=coords, custom_attribute=custom_attribute) self.text_equiv = text_equiv if text_equiv is not None else '' self.text_lines = text_lines if text_lines is not None else [] + self.type = region_type if region_type is not None else '' + + def sort_text_lines(self, top_to_bottom: bool=True) -> None: + """ + Sorts ``TextLine`` from top to bottom according to their mean y coordinate (centroid) + + :param top_to_bottom: order lines from top to bottom of image, default=True + """ + if top_to_bottom: + self.text_lines.sort(key=lambda line: np.mean([c.y for c in line.coords])) + else: + raise NotImplementedError @classmethod def from_xml(cls, e: ET.Element) -> 'TextRegion': @@ -335,11 +371,14 @@ def from_xml(cls, e: ET.Element) -> 'TextRegion': return TextRegion( **super().from_xml(e), text_lines=[TextLine.from_xml(tl) for tl in e.findall('p:TextLine', _ns)], - text_equiv=_get_text_equiv(e) + text_equiv=_get_text_equiv(e), + region_type=e.attrib.get('type') ) def to_xml(self, name_element='TextRegion') -> ET.Element: text_et = super().to_xml(name_element=name_element) + if self.type is not None and self.type != '': + text_et.set('type', self.type) for tl in self.text_lines: text_et.append(tl.to_xml()) text_equiv = ET.SubElement(text_et, 'TextEquiv') @@ -355,7 +394,8 @@ def to_dict(self, non_serializable_keys: List[str]=list()): def from_dict(cls, dictionary: dict) -> 'TextRegion': return cls(**super().from_dict(dictionary), text_lines=[TextLine.from_dict(tl) for tl in dictionary.get('text_lines', list())], - text_equiv=dictionary.get('text_equiv') + text_equiv=dictionary.get('text_equiv'), + region_type=dictionary.get('region_type') ) @@ -403,7 +443,7 @@ def from_dict(cls, dictionary: dict) -> 'TableRegion': return cls(**super().from_dict(dictionary), rows=dictionary.get('rows'), columns=dictionary.get('columns'), - embeded_text=dictionary.get('embeded_text')) + embedded_text=dictionary.get('embedded_text')) class SeparatorRegion(Region): @@ -418,8 +458,8 @@ class SeparatorRegion(Region): tag = 'SeparatorRegion' - def __init__(self, id: str, coords: List[Point]=None): - super().__init__(id=id, coords=coords) + def __init__(self, id: str, coords: List[Point]=None, custom_attribute: str=None): + super().__init__(id=id, coords=coords, custom_attribute=custom_attribute) @classmethod def from_xml(cls, e: ET.Element) -> 'SeparatorRegion': @@ -547,7 +587,8 @@ def __init__(self, id: str = None, coords: List[Point] = None, segment_ids: List @classmethod def from_dict(cls, dictionary: dict) -> 'GroupSegment': - return cls(**super().from_dict(dictionary)) + return 
cls(**super().from_dict(dictionary), + segment_ids=dictionary.get('segment_ids')) class Page(BaseElement): @@ -573,7 +614,7 @@ class Page(BaseElement): def __init__(self, **kwargs): self.image_filename = kwargs.get('image_filename') - self.image_width = _try_to_int(kwargs.get('image_width')) + self.image_width = _try_to_int(kwargs.get('image_width')) # Needs to be int type (not np.int32/64) self.image_height = _try_to_int(kwargs.get('image_height')) self.text_regions = kwargs.get('text_regions', []) self.graphic_regions = kwargs.get('graphic_regions', []) @@ -635,6 +676,14 @@ def to_xml(self) -> ET.Element: # page_et.append(self.metadata.to_xml()) return page_et + def to_json(self) -> dict: + self_dict = vars(self) + + serializable_keys = ['image_filename', 'image_height', 'image_width'] + json_dict = json_serialize(self_dict, [k for k in self_dict.keys() if k not in serializable_keys]) + + return json_dict + def write_to_file(self, filename: str, creator_name: str='dhSegment', comments: str='') -> None: """ Export Page object to json or page-xml format. Will assume the format based on the extension of the filename, @@ -653,17 +702,11 @@ def _write_xml(): root.append(self.to_xml()) for k, v in _attribs.items(): root.attrib[k] = v - ET.ElementTree(element=root).write(filename) + ET.ElementTree(element=root).write(filename, encoding='utf-8') def _write_json(): - self_dict = vars(self) - - # json_dict = dict() - serializable_keys = ['image_filename', 'image_height', 'image_width'] - json_dict = json_serialize(self_dict, [k for k in self_dict.keys() if k not in serializable_keys]) - with open(filename, 'w', encoding='utf8') as file: - json.dump(json_dict, file, indent=4, sort_keys=True, allow_nan=False) + json.dump(self.to_json(), file, indent=4, sort_keys=True, allow_nan=False) # Updating metadata self.metadata.creator = creator_name @@ -1005,3 +1048,24 @@ def save_baselines(filename, baselines, ratio=(1, 1), initial_shape=None): image_height=int(initial_shape[0]*ratio[0]) if initial_shape is not None else None, image_width=int(initial_shape[1]*ratio[1]) if initial_shape is not None else None) page.write_to_file(filename) + + +def get_unique_tags_from_xml_text_regions(xml_filename: str, + tag_pattern: str='{type:.*;}'): + """ + Get a list of all the values of labels/tags + + :param xml_filename: filename of the xml file + :param tag_pattern: regular expression pattern to look for in `TextRegion.custom_attribute` + :return: + """ + tagset = list() + page = parse_file(xml_filename) + for tr in page.text_regions: + custom_attribute = tr.custom_attribute + matches = re.findall(tag_pattern, custom_attribute) + assert len(matches) <= 1, "Found multiple matches in {}".format(custom_attribute) + if matches: + tagset.append(matches[0][6:-2]) + + return list(np.unique(tagset)) diff --git a/dh_segment/io/__init__.py b/dh_segment/io/__init__.py index ca0fabb..531ba66 100644 --- a/dh_segment/io/__init__.py +++ b/dh_segment/io/__init__.py @@ -63,6 +63,40 @@ PAGE.json_serialize ---- + +.. _ref_via: + +VGG Image Annotator helpers +--------------------------- + + +**VIA objects** + +.. autosummary:: + via.WorkingItem + via.VIAttribute + + +**Creating masks with VIA annotations** + +.. autosummary:: + via.load_annotation_data + via.export_annotation_dict + via.get_annotations_per_file + via.parse_via_attributes + via.get_via_attributes + via.collect_working_items + via.create_masks + + +**Formatting in VIA JSON format** + +.. 
autosummary:: + via.create_via_region_from_coordinates + via.create_via_annotation_single_image + +---- + """ @@ -103,3 +137,5 @@ from .input import * from .input_utils import * from . import PAGE +from . import via + diff --git a/dh_segment/io/input.py b/dh_segment/io/input.py index 453ca2d..eaf5bad 100644 --- a/dh_segment/io/input.py +++ b/dh_segment/io/input.py @@ -17,9 +17,9 @@ class InputCase(Enum): INPUT_CSV = 'INPUT_CSV' -def input_fn(input_data: Union[str, List[str]], params: dict, input_label_dir: str=None, - data_augmentation: bool=False, batch_size: int=5, make_patches: bool=False, num_epochs: int=1, - num_threads: int=4, image_summaries: bool=False): +def input_fn(input_data: Union[str, List[str]], params: dict, input_label_dir: str = None, + data_augmentation: bool = False, batch_size: int = 5, make_patches: bool = False, num_epochs: int = 1, + num_threads: int = 4, image_summaries: bool = False, progressbar_description: str = 'Dataset'): """ Input_fn for estimator @@ -33,6 +33,7 @@ def input_fn(input_data: Union[str, List[str]], params: dict, input_label_dir: s :param num_epochs: number of epochs to cycle trough data (set it to None for infinite repeat) :param num_threads: number of thread to use in parallele when usin tf.data.Dataset.map :param image_summaries: boolean, whether to make tf.Summary to watch on tensorboard + :param progressbar_description: what will appear in the progressbar showing the number of files read :return: fn """ training_params = utils.TrainingParams.from_dict(params['training_params']) @@ -96,8 +97,9 @@ def _scaling_and_patch_fn(input_image, label_image): # Data augmentation def _augment_data_fn(input_image, label_image): \ - return data_augmentation_fn(input_image, label_image, training_params.data_augmentation_flip_lr, - training_params.data_augmentation_flip_ud, training_params.data_augmentation_color) + return data_augmentation_fn(input_image, label_image, training_params.data_augmentation_flip_lr, + training_params.data_augmentation_flip_ud, + training_params.data_augmentation_color) # Assign color to class id def _assign_color_to_class_id(input_image, label_image): @@ -112,13 +114,14 @@ def _assign_color_to_class_id(input_image, label_image): output['weight_maps'] = local_entropy(tf.equal(label_image, 1), sigma=training_params.local_entropy_sigma) return output + # --- # Finding the list of images to be used if isinstance(input_data, list): input_case = InputCase.INPUT_LIST input_image_filenames = input_data - print('Found {} images'.format(len(input_image_filenames))) + #print('Found {} images'.format(len(input_image_filenames))) elif os.path.isdir(input_data): input_case = InputCase.INPUT_DIR @@ -126,13 +129,14 @@ def _assign_color_to_class_id(input_image, label_image): recursive=True) + \ glob(os.path.join(input_data, '**', '*.png'), recursive=True) - print('Found {} images'.format(len(input_image_filenames))) + #print('Found {} images'.format(len(input_image_filenames))) elif os.path.isfile(input_data) and \ input_data.endswith('.csv'): input_case = InputCase.INPUT_CSV else: - raise NotImplementedError('Input data should be a directory, a csv file or a list of filenames but got {}'.format(input_data)) + raise NotImplementedError( + 'Input data should be a directory, a csv file or a list of filenames but got {}'.format(input_data)) # Finding the list of labelled images if available has_labelled_data = False @@ -161,23 +165,24 @@ def _assign_color_to_class_id(input_image, label_image): if not os.path.exists(img_filename): raise 
FileNotFoundError(img_filename) if has_labelled_data: - for img_filename in input_image_filenames: - if not os.path.exists(img_filename): - raise FileNotFoundError(img_filename) + for label_filename in label_image_filenames: + if not os.path.exists(label_filename): + raise FileNotFoundError(label_filename) # Tensorflow input_fn def fn(): if not has_labelled_data: encoded_filenames = [f.encode() for f in input_image_filenames] - dataset = tf.data.Dataset.from_generator(lambda: tqdm(encoded_filenames, desc='Dataset'), + dataset = tf.data.Dataset.from_generator(lambda: tqdm(encoded_filenames, desc=progressbar_description), tf.string, tf.TensorShape([])) dataset = dataset.repeat(count=num_epochs) dataset = dataset.map(lambda filename: {'images': load_and_resize_image(filename, 3, training_params.input_resized_size)}) else: encoded_filenames = [(i.encode(), l.encode()) for i, l in zip(input_image_filenames, label_image_filenames)] - dataset = tf.data.Dataset.from_generator(lambda: tqdm(utils.shuffled(encoded_filenames), desc='Dataset'), - (tf.string, tf.string), (tf.TensorShape([]), tf.TensorShape([]))) + dataset = tf.data.Dataset.from_generator(lambda: tqdm(utils.shuffled(encoded_filenames), + desc=progressbar_description), + (tf.string, tf.string), (tf.TensorShape([]), tf.TensorShape([]))) dataset = dataset.repeat(count=num_epochs) dataset = dataset.map(_load_image_fn, num_threads).flat_map(_scaling_and_patch_fn) @@ -193,6 +198,8 @@ def fn(): if make_patches and input_label_dir: base_shape_images = list(training_params.patch_shape) + elif make_patches and input_case == InputCase.INPUT_CSV: + base_shape_images = list(training_params.patch_shape) else: base_shape_images = [-1, -1] # Pad things diff --git a/dh_segment/io/via.py b/dh_segment/io/via.py new file mode 100644 index 0000000..a42e85a --- /dev/null +++ b/dh_segment/io/via.py @@ -0,0 +1,959 @@ +#!/usr/bin/env python +# coding: utf-8 + +__author__ = "maudehrmann, solivr" +__license__ = "GPL" + +import json +import os +import re +from tqdm import tqdm +import numpy as np +from skimage import transform +from collections import namedtuple +from imageio import imsave, imread +import requests +from PIL import Image +from itertools import filterfalse, chain +from typing import List, Tuple, Dict +import cv2 +from . import PAGE + + +# To define before using the corresponding functions +# iiif_password = os.environ["IIIF_PWD"] +iiif_password = '' + + +WorkingItem = namedtuple( + "WorkingItem", [ + 'collection', + 'image_name', + 'original_x', + 'original_y', + 'reduced_x', + 'reduced_y', + 'iiif', + 'annotations' + ] +) +WorkingItem.__doc__ = """ +A container for annotated images. + +:param str collection: name of the collection +:param str image_name: name of the image +:param int original_x: original image x size (width) +:param int original_y: original image y size (height) +:param int reduced_x: resized x size +:param int reduced_y: resized y size +:param str iiif: iiif url +:param dict annotations: VIA 'region_attributes' +""" + + +VIAttribute = namedtuple( + "VIAttribute", [ + 'name', + 'type', + 'options' + ] +) +VIAttribute.__doc__ = """ +A container for VIA attributes. + +:param str name: The name of attribute +:param str type: The type of the annotation (dropdown, markbox, ...) +:param list options: The options / labels possible for this attribute. 
+""" + + +def parse_via_attributes(via_attributes: dict) -> List[VIAttribute]: + """ + Parses the VIA attribute dictionary and returns a list of VIAttribute instances + + :param via_attributes: attributes from VIA annotation ('_via_attributes' field) + :return: list of ``VIAttribute`` + """ + + if {'file', 'region'}.issubset(set(via_attributes.keys())): + via_attributes = via_attributes['region'] + + list_attributes = list() + for k, v in via_attributes.items(): + if v['type'] == 'text': + print('WARNING : Please do not use text type for attributes because it is more prone to errors/typos which ' + 'can make the parsing fail. Use instead "checkbox", "dropdown" or "radio" with defined options.') + options = None + else: + options = list(v['options'].keys()) + + list_attributes.append(VIAttribute(k, + v['type'], + options)) + + return list_attributes + + +def get_annotations_per_file(via_dict: dict, name_file: str) -> dict: + """ + From VIA json content, get annotations relative to the given `name_file`. + + :param via_dict: VIA annotations content (originally json) + :param name_file: the file to look for (it can be a iiif path or a file path) + :return: dict + """ + + # Check that the annotation_dict is a "via_project" file (project export), + # or a "via_region" file (annotation export) + if '_via_img_metadata' in via_dict.keys(): + annotation_dict = via_dict['_via_img_metadata'] + else: + annotation_dict = via_dict + + # If it looks like a iiif path add "-1" + if 'http' in name_file: + key = name_file + "-1" + else: + # find the key that contains the name_file + list_keys = list(filterfalse(lambda x: name_file not in x, list(annotation_dict.keys()))) + assert len(list_keys) == 1, "There is more than one key for the file '{} : \n{}'".format(name_file, list_keys) + key = list_keys[0] + + if key in annotation_dict.keys(): + myannotation = annotation_dict[key] + if name_file == myannotation['filename']: + return myannotation['regions'] + else: + return None + + +def _compute_reduced_dimensions(x: int, y: int, target_h: int=2000) -> Tuple[int, int]: + """ + Compute new dimensions with height set to `target_h`. 
+ + :param x: height + :param y: width + :param target_h: target height + :return: tuple + """ + ratio = y / x + target_w = int(target_h * ratio) + return target_h, target_w + + +def _collect_working_items_from_local_images(via_annotations: dict, images_dir: str, collection_name: str) \ + -> List[WorkingItem]: + """ + Given VIA annotation input, collect all info on `WorkingItem` object, when images come from local files + + :param via_annotations: via_annotations: via annotations ('regions' field) + :param images_dir: directory where to find the images + :param collection_name: name of the collection + :return: + """ + + def _formatting(name_id: str) -> str: + name_id = re.sub('.jpg\d*', '.jpg', name_id) + name_id = re.sub('.png\d*', '.png', name_id) + return name_id + + def _get_image_shape_without_loading(filename: str) -> Tuple[int, int]: + image = Image.open(filename) + shape = image.size + image.close() + return shape + + working_items = list() + + for key, v in tqdm(via_annotations.items()): + filename = _formatting(key) + + absolute_filename = os.path.join(images_dir, filename) + shape_image = _get_image_shape_without_loading(absolute_filename) + + regions = v['regions'] + + if regions: + wk_item = WorkingItem(collection=collection_name, + image_name=filename.split('.')[0], + original_x=shape_image[0], + original_y=shape_image[1], + reduced_x=None, + reduced_y=None, + iiif=None, + annotations=regions) + + working_items.append(wk_item) + + return working_items + + +def _collect_working_items_from_iiif(via_annotations: dict, collection_name: str, iiif_user='my-team') -> dict: + """ + Given VIA annotation input, collect all info on `WorkingItem` object, when the images come from IIIF urls + + :param via_annotations: via_annotations: via annotations ('regions' field) + :param collection_name: name of the collection + :param iiif_user: user param for requests.Session().get() + :return: + """ + + working_items = list() + session = requests.Session() + + for key, v in tqdm(via_annotations.items()): + iiif_url = v['filename'] + + image_name = os.path.basename(iiif_url.split('/full/full/')[0]) + + # get image dimensions + iiif_json = iiif_url.replace("default.jpg", "info.json") + resp_json = session.get(iiif_json, auth=(iiif_user, iiif_password)) + if resp_json.status_code == requests.codes.ok: + y = resp_json.json()['height'] + x = resp_json.json()['width'] + # target_h, target_w = _compute_reduced_dimensions(x, y) + target_h, target_w = None, None + else: + x, y, target_w, target_h = None, None, None, None + resp_json.raise_for_status() + + regions = v['regions'] + + if regions: + wk_item = WorkingItem(collection=collection_name, + image_name=image_name.split('.')[0], + original_x=x, + original_y=y, + reduced_x=target_w, + reduced_y=target_h, + iiif=iiif_url, + annotations=regions) + + working_items.append(wk_item) + + return working_items + + +def collect_working_items(via_annotations: dict, collection_name: str, images_dir: str=None, + via_version: int=2) -> List[WorkingItem]: + """ + Given VIA annotation input, collect all info on `WorkingItem` object. + This function will take care of separating images from local files and images from IIIF urls. 
+ + :param via_annotations: via annotations ('regions' field) + :param images_dir: directory where to find the images + :param collection_name: name of the collection + :param via_version: version of the VIA tool used to produce the annotations (1 or 2) + :return: list of `WorkingItem` + """ + + via_annotations_v2 = via_annotations.copy() + if via_version == 1: + for key, value in via_annotations_v2.items(): + list_regions = list() + for v_region in value['regions'].values(): + list_regions.append(v_region) + via_annotations_v2[key]['regions'] = list_regions + + local_annotations = {k: v for k, v in via_annotations_v2.items() if 'http' not in k} + url_annotations = {k: v for k, v in via_annotations_v2.items() if 'http' in k} + + working_items = list() + if local_annotations: + assert images_dir is not None + working_items += _collect_working_items_from_local_images(local_annotations, images_dir, collection_name) + if url_annotations: + working_items += _collect_working_items_from_iiif(url_annotations, collection_name) + + return working_items + + +def _scale_down_original(working_item, img_out_dir: str) -> None: + """ + Copy and reduce original image files. + + :param img_out_dir: where to put the downscaled images + :param working_item: dict of `WorkingItems` + :return: None + """ + + def _getimage_from_iiif(url, user, pwd): + img = requests.get(url, auth=(user, pwd)) + return imread(img.content) + + image_set_dir = os.path.join(img_out_dir, working_item.collection, "images") + if not os.path.exists(image_set_dir): + try: + os.makedirs(image_set_dir) + except OSError as e: + if e.errno != os.errno.EEXIST: + raise + pass + + outfile = os.path.join(image_set_dir, working_item.image_name + "_ds.png") + if not os.path.isfile(outfile): + img = _getimage_from_iiif(working_item.iiif, 'epfl-team', iiif_password) + img_resized = transform.resize( + img, + [working_item.reduced_y, working_item.reduced_x], + anti_aliasing=False, + preserve_range=True + ) + imsave(outfile, img_resized.astype(np.uint8)) + + +def load_annotation_data(via_data_filename: str, only_img_annotations: bool=False, via_version: int=2) -> dict: + """ + Load the content of via annotation files. + + :param via_data_filename: via annotations json file + :param only_img_annotations: load only the images annotations ('_via_img_metadata' field) + :param via_version: + :return: the content of json file containing the region annotated + """ + + with open(via_data_filename, 'r', encoding='utf8') as f: + content = json.load(f) + if via_version == 2: + assert '_via_img_metadata' in content.keys(), "The file is not a valid VIA project export." + + if only_img_annotations: + return content['_via_img_metadata'] + else: + return content + else: + return content + + +def export_annotation_dict(annotation_dict: dict, filename: str) -> None: + """ + Export the annotations to json file. + + :param annotation_dict: VIA annotations + :param filename: filename to export the data (json file) + :return: + """ + with open(filename, 'w', encoding='utf8') as f: + json.dump(annotation_dict, f) + + +def get_via_attributes(annotation_dict: dict, via_version: int=2) -> List[VIAttribute]: + """ + Gets the attributes of the annotated data and returns a list of `VIAttribute`. 
+ + :param annotation_dict: json content of the VIA exported file + :param via_version: either 1 or 2 (for VIA v 1.0 or VIA v 2.0) + :return: A list containing VIAttributes + """ + + if via_version == 1: + + list_attributes = [list(region['region_attributes'].keys()) + for value in annotation_dict.values() + for region in value['regions'].values()] + + # Find options + unique_attributes = list(np.unique(list(chain.from_iterable(list_attributes)))) + + dict_labels = {rgn_att: list() for rgn_att in unique_attributes} + for value in annotation_dict.values(): + regions = value['regions'] + for region in regions.values(): + for k, v in region['region_attributes'].items(): + dict_labels[k].append(v) + + elif via_version == 2: + + if '_via_attributes' in annotation_dict.keys(): # If project_export is given + return parse_via_attributes(annotation_dict['_via_attributes']) + + else: # else if annotation_export is given + + list_attributes = [list(region['region_attributes'].keys()) + for value in annotation_dict.values() + for region in value['regions']] + + # Find options + unique_attributes = list(np.unique(list(chain.from_iterable(list_attributes)))) + + dict_labels = {rgn_att: list() for rgn_att in unique_attributes} + for value in annotation_dict.values(): + regions = value['regions'] + for region in regions: + for k, v in region['region_attributes'].items(): + dict_labels[k].append(v) + + else: + raise NotImplementedError + + # Instantiate VIAttribute objects + viattribute_list = list() + for attribute, options in dict_labels.items(): + + if all(isinstance(opt, str) for opt in options): + viattribute_list.append(VIAttribute(name=attribute, + type=None, + options=list(np.unique(options)))) + + elif all(isinstance(opt, dict) for opt in options): + viattribute_list.append(VIAttribute(name=attribute, + type=None, + options=list(np.unique(list(chain.from_iterable(options)))))) + + else: + raise NotImplementedError + return viattribute_list + + +def _draw_mask(via_region: dict, mask: np.array, contours_only: bool=False) -> np.array: + """ + + :param via_region: region to draw (in VIA format) + :param mask: image mask to draw on + :param contours_only: if `True`, draws only the contours of the region, if `False`, fills the region + :return: the drawn mask + """ + + shape_attributes_dict = via_region['shape_attributes'] + + if shape_attributes_dict['name'] == 'rect': + x = shape_attributes_dict['x'] + y = shape_attributes_dict['y'] + w = shape_attributes_dict['width'] + h = shape_attributes_dict['height'] + + contours = np.array([[x, y], + [x + w, y], + [x + w, y + h], + [x, y + h] + ]).reshape((-1, 1, 2)) + + mask = cv2.polylines(mask, [contours], True, 255, thickness=15) if contours_only \ + else cv2.fillPoly(mask, [contours], 255) + + elif shape_attributes_dict['name'] == 'polygon': + contours = np.stack([shape_attributes_dict['all_points_x'], + shape_attributes_dict['all_points_y']], axis=1)[:, None, :] + + mask = cv2.polylines(mask, [contours], True, 255, thickness=15) if contours_only \ + else cv2.fillPoly(mask, [contours], 255) + + elif shape_attributes_dict['name'] == 'circle': + center_point = (shape_attributes_dict['cx'], shape_attributes_dict['cy']) + radius = shape_attributes_dict['r'] + + mask = cv2.circle(mask, center_point, radius, 255, thickness=15) if contours_only \ + else cv2.circle(mask, center_point, radius, 255, thickness=-1) + + elif shape_attributes_dict['name'] == 'polyline': + contours = np.stack([shape_attributes_dict['all_points_x'], + 
shape_attributes_dict['all_points_y']], axis=1)[:, None, :] + + mask = cv2.polylines(mask, [contours], False, 255, thickness=15) + + else: + raise NotImplementedError( + 'Mask annotation for shape of type "{}" has not been implemented yet' + .format(shape_attributes_dict['name'])) + + return mask + + +def _write_mask(mask: np.ndarray, masks_dir: str, collection: str, image_name: str, label: str) -> None: + """ + Save a mask with filename containing 'label'. + + :param mask: mask b&w image (H, W) + :param masks_dir: directory to output mask + :param collection: name of the collection + :param image_name: name of the image + :param label: label of the mask + :return: + """ + + outdir = os.path.join(masks_dir, collection, image_name) + if not os.path.exists(outdir): + os.makedirs(outdir) + label = label.strip(' \n').replace(" ", "_").lower() if label is not None else 'nolabel' + outfile = os.path.join(outdir, image_name + "-mask-" + label + ".png") + imsave(outfile, mask.astype(np.uint8)) + + +def create_masks(masks_dir: str, working_items: List[WorkingItem], via_attributes: List[VIAttribute], + collection: str, contours_only: bool=False) -> dict: + """ + For each annotation, create a corresponding binary mask and resize it (h = 2000). Only valid for VIA 2.0. + Several annotations of the same class on the same image produce one image with several masks. + + :param masks_dir: where to output the masks + :param working_items: infos to work with + :param via_attributes: VIAttributes computed by ``get_via_attributes`` function. + :param collection: name of the nollection + :param contours_only: creates the binary masks only for the contours of the object (thickness of contours : 20 px) + :return: annotation_summary, a dictionary containing a list of labels per image + """ + + def resize_and_write_mask(mask_image: np.ndarray, working_item: WorkingItem, label_item: str) -> None: + """ + Resize only if needed (if working_item.reduced != working_item.original) + + :param mask_image: mask image to write + :param working_item: `WorkingItem` object + :param label_item: label name to append to filename + :return: + """ + + if not working_item.reduced_y and not working_item.reduced_x: + _write_mask(mask_image, masks_dir, collection, working_item.image_name, label_item) + + elif working_item.reduced_x != working_item.original_x and working_item.reduced_y != working_item.original_y: + mask_resized = transform.resize(mask_image, + [working_item.reduced_y, working_item.reduced_x], + anti_aliasing=False, + preserve_range=True, + order=0) + _write_mask(mask_resized, masks_dir, collection, working_item.image_name, label_item) + + else: + _write_mask(mask_image, masks_dir, collection, working_item.image_name, label_item) + # ------------------- + + print("Creating masks in {}...".format(masks_dir)) + + annotation_summary = dict() + + for wi in tqdm(working_items, desc="workingItem2mask"): + labels = list() + + # the image has no annotation, writing a black mask: + if not wi.annotations: + mask = np.zeros([wi.original_y, wi.original_x], np.uint8) + resize_and_write_mask(mask, wi, None) + labels.append("nolabel") + + # check all possible labels for the image and create mask: + else: + for attribute in via_attributes: + for option in attribute.options: + # get annotations that have the current attribute + selected_regions = list(filter(lambda r: attribute.name in r['region_attributes'].keys(), + wi.annotations)) + # get annotations that have the current attribute and option + if selected_regions: + selected_regions 
= list(filter(lambda r: r['region_attributes'][attribute.name] == option, + selected_regions)) + else: + continue + + if selected_regions: + # create a 0 matrix (black background) + mask = np.zeros([wi.original_y, wi.original_x], np.uint8) + + # nb: if 2 labels are on the same page, they belongs to the same mask + for sr in selected_regions: + mask = _draw_mask(sr, mask, contours_only) + + label = '{}-{}'.format(attribute.name, option).lower() + resize_and_write_mask(mask, wi, label) + # add to existing labels + labels.append(label) + + # write summary: list of existing labels per image + annotation_summary[wi.image_name] = labels + outfile = os.path.join(masks_dir, collection, collection + "-classes.txt") + with open(outfile, 'a') as fh: + for a in annotation_summary: + fh.write(a + "\t" + str(annotation_summary[a]) + "\n") + + print("Done.") + return annotation_summary + + +def _get_coordinates_from_xywh(via_regions: List[dict]) -> List[np.array]: + """ + From VIA region dictionaries, get the coordinates array (N,2) of the annotations + + :param via_regions: + :return: + """ + list_coordinates_regions = list() + for region in via_regions: + shape_attributes_dict = region['shape_attributes'] + if shape_attributes_dict['name'] == 'rect': + x = shape_attributes_dict['x'] + y = shape_attributes_dict['y'] + w = shape_attributes_dict['width'] + h = shape_attributes_dict['height'] + + coordinates = np.array([[x, y], + [x + w, y], + [x + w, y + h], + [x, y + h] + ]) + list_coordinates_regions.append(coordinates) + elif shape_attributes_dict['name'] == 'polygon': + coordinates = np.stack([shape_attributes_dict['all_points_x'], + shape_attributes_dict['all_points_y']], axis=1) + list_coordinates_regions.append(coordinates) + elif shape_attributes_dict['name'] == 'polyline': + coordinates = np.stack([shape_attributes_dict['all_points_x'], + shape_attributes_dict['all_points_y']], axis=1) + list_coordinates_regions.append(coordinates) + else: + raise NotImplementedError( + "This method has not been implemenetd yet for {}".format(shape_attributes_dict['name'])) + + return list_coordinates_regions + + +# EXPORT +# ------ + +def _get_xywh_from_coordinates(coordinates: np.array) -> Tuple[int, int, int, int]: + """ + From coordinates points get x,y, width, height + + :param coordinates: (N,2) coordinates (x,y) + :return: x, y, w, h + """ + + x = np.min(coordinates[:, 0]) + y = np.min(coordinates[:, 1]) + w = np.max(coordinates[:, 0]) - x + h = np.max(coordinates[:, 1]) - y + + return x, y, w, h + + +def create_via_region_from_coordinates(coordinates: np.array, region_attributes: dict, type_region: str) -> dict: + """ + Formats coordinates to a VIA region (dict). 
+ + :param coordinates: (N, 2) coordinates (x, y) + :param region_attributes: dictionary with keys : name of labels, values : values of labels + :param type_region: via region annotation type ('rect', 'polygon') + :return: a region in VIA style (dict/json) + """ + assert type_region in ['rect', 'polygon', 'circle'] + + if type_region == 'rect': + x, y, w, h = _get_xywh_from_coordinates(coordinates) + shape_atributes = { + 'name': 'rect', + 'height': int(h), + 'width': int(w), + 'x': int(x), + 'y': int(y) + } + elif type_region == 'polygon': + points_x = list(coordinates[:, 0]) + points_y = list(coordinates[:, 1]) + + shape_atributes = { + 'name': 'polygon', + 'all_points_x': [int(p) for p in points_x], + 'all_points_y': [int(p) for p in points_y], + } + elif type_region == 'circle': + raise NotImplementedError('The type {} is not supported for the export.'.format(type)) + + return {'region_attributes': region_attributes, + 'shape_attributes': shape_atributes} + + +def create_via_annotation_single_image(img_filename: str, via_regions: List[dict], + file_attributes: dict=None) -> Dict[str, dict]: + """ + Returns a dictionary item {key: annotation} in VIA format to further export to .json file + + :param img_filename: path to the image + :param via_regions: regions in VIA format (output from ``create_via_region_from_coordinates``) + :param file_attributes: file attributes (usually None) + :return: dictionary item with key and annotations in VIA format + """ + if 'http' in img_filename: + basename = img_filename + file_size = -1 + else: + basename = os.path.basename(img_filename) + file_size = os.path.getsize(img_filename) + + via_key = '{}{}'.format(basename, file_size) + + via_annotation = { + 'file_attributes': file_attributes if file_attributes is not None else dict(), + 'filename': basename, + 'size': file_size, + 'regions': via_regions + } + + return {via_key: via_annotation} + + +# PAGE CONVERSION +# --------------- + +def convert_via_region_page_text_region(working_item: WorkingItem, structure_label: str) -> PAGE.Page: + """ + + :param working_item: + :param structure_label: + :return: + """ + + # TODO : this is not yet generic because we're missing the automatic detection of the structure label + + region_coordinates = _get_coordinates_from_xywh(working_item.annotations) + + page = PAGE.Page(image_filename=working_item.image_name + 'jpg', + image_width=working_item.original_x, + image_height=working_item.original_y, + graphic_regions=[ + PAGE.TextRegion(coords=PAGE.Point.array_to_point(coords), + custom_attribute='structure{{type:{};}}'.format(structure_label)) + for coords in region_coordinates]) + return page + + +# def convert_via_region_to_text_region(via_regions: List[dict], structure_label: str) -> PAGE.TextRegion: +# """ +# +# :param via_region: +# :param structure_label: +# :return: +# """ +# +# # TODO : this is not yet generic because we're missing the automatic detection of the structure label +# +# region_coordinates = _get_coordinates_from_xywh(working_item.annotations) +# +# page = PAGE.Page(image_filename=working_item.image_name + 'jpg', +# image_width=working_item.original_x, +# image_height=working_item.original_y, +# graphic_regions=[ +# PAGE.TextRegion(coords=PAGE.Point.array_to_point(coords), +# custom_attribute='structure{{type:{};}}'.format(structure_label)) +# for coords in region_coordinates]) +# return page + + +""" +Example of usage + + +collection = 'mycollection' +annotation_file = 'via_sample.json' +masks_dir = '/home/project/generated_masks' +images_dir = 
'./my_images' + +# Load all the data in the annotation file (the file may be an exported project or an export of the annotations) +via_data = load_annotation_data(annotation_file) + +# In the case of an exported project file, you can set ``only_img_annotations=True`` to get only +# the region annotations +via_annotations = load_annotation_data(annotation_file, only_img_annotations=True) + +# Collect the annotated regions +working_items = collect_working_items(via_annotations, collection, images_dir) + +# Collect the attributes and options +if '_via_attributes' in via_data.keys(): + list_attributes = parse_via_attributes(via_data['_via_attributes']) +else: + list_attributes = get_via_attributes(via_annotations) + +# Create one mask per option per attribute +create_masks(masks_dir, wi,via_attributes, collection) +""" + + +""" +Content of a via_project exported file + +{'_via_attributes': { + ... + }, + '_via_img_metadata': { + ... + }, + '_via_settings': { + 'core': { + 'buffer_size': 18, + 'default_filepath': '', + 'filepath': {} + }, + 'project': { + 'name': 'via_project_7Feb2019_10h7m' + }, + 'ui': { + 'annotation_editor_fontsize': 0.8, + 'annotation_editor_height': 25, + 'image': { + 'region_label': 'region_id', + 'region_label_font': '10px Sans' + }, + 'image_grid': { + 'img_height': 80, + 'rshape_fill': 'none', + 'rshape_fill_opacity': 0.3, + 'rshape_stroke': 'yellow', + 'rshape_stroke_width': 2, + 'show_image_policy': 'all', + 'show_region_shape': True + }, + 'leftsidebar_width': 18 + } + } +} + +""" + +""" +"_via_attributes": { + "region": { + "attribute1": { + "type":"text", + "description":"", + "default_value":"" + }, + "attribute2": { + "type":"dropdown", + "description":"", + "options": { + "op1":"", + "op2":"" + }, + "default_options":{} + }, + "attribute3": { + "type":"checkbox", + "description":"", + "options": { + "op1":"", + "op2":"" + }, + "default_options":{} + }, + "attribute 4": { + "type":"radio", + "description":"", + "options": { + "op1":"", + "op2":"" + }, + "default_options":{} + } + }, + "file":{} +} + +""" + +""" +'_via_img_metadata': { + 'image_filename1.jpg2209797': { + 'file_attributes': {}, + 'filename': 'image_filename1.jpg', + 'regions': + [{ + 'region_attributes': { + 'attribute1': { + 'op1': True, + 'op2': True + }, + 'attribute 2': 'label1', + 'attribute 3': 'op1' + }, + 'shape_attributes': { + 'height': 2277, + 'name': 'rect', + 'width': 1541, + 'x': 225, + 'y': 458 + } + }, + { + 'region_attributes': { + 'attribute 4': 'op1', + 'attribute 1': {}, + 'attribute 2': 'label1', + 'attribute 3': 'op2' + }, + 'shape_attributes': { + 'height': 2255, + 'name': 'rect', + 'width': 1554, + 'x': 1845, + 'y': 476 + } + }], + 'size': 2209797}, + 'https://libimages.princeton.edu/loris/pudl0001/5138415/00000011.jp2/full/full/0/default.jpg-1': { + 'file_attributes': {}, + 'filename': 'https://libimages.princeton.edu/loris/pudl0001/5138415/00000011.jp2/full/full/0/default.jpg', + 'regions': + [{ + 'region_attributes': { + 'attribute 4': 'op2', + 'attribute 1': { + 'op1': True + }, + 'attribute 2': 'label3', + 'attribute 3': 'op1' + }, + 'shape_attributes': { + 'height': 1026, + 'name': 'rect', + 'width': 1430, + 'x': 145, + 'y': 525 + } + }, + { + 'region_attributes': { + 'attribute 4': 'op2', + 'attribute 1': { + 'op1': True}, + 'attribute 2': 'label 3 ', + 'attribute 3': 'op1', + }, + 'shape_attributes': { + 'all_points_x': [2612, 2498, 2691, 2757, 2962, 3034, 2636], + 'all_points_y': [5176, 5616, 5659, 5363, 5375, 5110, 5122], + 'name': 'polygon' + } + }, + { + 
'region_attributes': { + 'attribute 4': 'op2', + 'attribute 1': { + 'op1': True}, + 'attribute 2': 'label 3 ', + 'attribute 3': 'op1', + }, + 'shape_attributes': { + 'cx': 2793, + 'cy': 881, + 'name': 'circle', + 'r': 524 + } + }, + { + 'region_attributes': { + 'attribute 4': 'op1', + 'attribute 1': { + 'op2': True}, + 'attribute 2': 'label1', + 'attribute 3': 'op2', + }, + 'shape_attributes': { + 'all_points_x': [3246, 5001], + 'all_points_y': [422, 380], + 'name': 'polyline' + } + }], + 'size': -1 + } +} +""" diff --git a/dh_segment/network/__init__.py b/dh_segment/network/__init__.py index 553184c..e5aa398 100644 --- a/dh_segment/network/__init__.py +++ b/dh_segment/network/__init__.py @@ -1,12 +1,14 @@ _MODEL = [ - 'inference_vgg16', - 'inference_resnet_v1_50', - 'inference_u_net', - 'vgg_16_fn', - 'resnet_v1_50_fn' + 'Encoder', + 'Decoder', + 'SimpleDecoder', ] -__all__ = _MODEL +_PRETRAINED = [ + 'ResnetV1_50', + 'VGG16' +] +__all__ = _MODEL + _PRETRAINED from .model import * from .pretrained_models import * diff --git a/dh_segment/network/model.py b/dh_segment/network/model.py index cc55bae..b20de04 100644 --- a/dh_segment/network/model.py +++ b/dh_segment/network/model.py @@ -1,310 +1,136 @@ #!/usr/bin/env python import tensorflow as tf -from ..utils import ModelParams from tensorflow.contrib import layers # TODO migration to tf.layers ? -from tensorflow.contrib.slim.nets import resnet_v1 from tensorflow.contrib.slim import arg_scope -from .pretrained_models import vgg_16_fn, resnet_v1_50_fn -from collections import OrderedDict +from abc import ABC, abstractmethod +from typing import List, Union, Tuple, Optional, Dict -def inference_vgg16(images: tf.Tensor, params: ModelParams, num_classes: int, use_batch_norm=False, weight_decay=0.0, - is_training=False) -> tf.Tensor: - with tf.name_scope('vgg_augmented'): - - if use_batch_norm: - if params.batch_renorm: - renorm_clipping = {'rmax': 100, 'rmin': 0.1, 'dmax': 10} - renorm_momentum = 0.98 - else: - renorm_clipping = None - renorm_momentum = 0.99 - batch_norm_fn = lambda x: tf.layers.batch_normalization(x, axis=-1, training=is_training, name='batch_norm', - renorm=params.batch_renorm, - renorm_clipping=renorm_clipping, - renorm_momentum=renorm_momentum) - else: - batch_norm_fn = None - - def upsample_conv(pooled_layer, previous_layer, layer_params, number): - with tf.name_scope('deconv{}'.format(number)): - if previous_layer.get_shape()[1].value and previous_layer.get_shape()[2].value: - target_shape = previous_layer.get_shape()[1:3] - else: - target_shape = tf.shape(previous_layer)[1:3] - upsampled_layer = tf.image.resize_images(pooled_layer, target_shape, - method=tf.image.ResizeMethod.BILINEAR) - input_tensor = tf.concat([upsampled_layer, previous_layer], 3) - - for i, (nb_filters, filter_size) in enumerate(layer_params): - input_tensor = layers.conv2d( - inputs=input_tensor, - num_outputs=nb_filters, - kernel_size=[filter_size, filter_size], - normalizer_fn=batch_norm_fn, - scope="conv{}_{}".format(number, i + 1) - ) - return input_tensor +class Encoder(ABC): + @abstractmethod + def __call__(self, images: tf.Tensor, is_training=False) -> List[tf.Tensor]: + """ - # Original VGG : - vgg_net, intermediate_levels = vgg_16_fn(images, blocks=5, weight_decay=weight_decay) - out_tensor = vgg_net + :param images: [NxHxWx3] float32 [0..255] input images + :return: a list of the feature maps in decreasing spatial resolution (first element is most likely the input \ + image itself, then the output of the first pooling op, etc...) 
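For reference, the VIA helpers introduced above (`create_via_region_from_coordinates` and `create_via_annotation_single_image`) can be exercised as in the sketch below. The import path, coordinates and labels are assumptions for illustration only; note that for a local file the image must exist on disk, since its size becomes part of the VIA key (an http URL also works and gets size -1).

```python
import json
import numpy as np

# Assumed import path for the helpers shown in this patch; adjust if the module lives elsewhere.
from dh_segment.io.via import (create_via_region_from_coordinates,
                               create_via_annotation_single_image)

# (N, 2) array of (x, y) corner points describing a rectangular region
coords = np.array([[225, 458], [1766, 458], [1766, 2735], [225, 2735]])

region = create_via_region_from_coordinates(coords,
                                            region_attributes={'attribute1': 'label1'},
                                            type_region='rect')

# One {key: annotation} item, ready to be merged with other images and dumped to .json
annotation = create_via_annotation_single_image('image_filename1.jpg', via_regions=[region])

with open('via_annotations.json', 'w') as f:
    json.dump(annotation, f)
```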
+ """ + pass - # Intermediate convolution - if params.intermediate_conv is not None: - with tf.name_scope('intermediate_convs'): - for layer_params in params.intermediate_conv: - for k, (nb_filters, filter_size) in enumerate(layer_params): - out_tensor = layers.conv2d(inputs=out_tensor, - num_outputs=nb_filters, - kernel_size=[filter_size, filter_size], - normalizer_fn=batch_norm_fn, - scope='conv_{}'.format(k + 1)) + def pretrained_information(self) -> Tuple[Optional[str], Union[None, List, Dict]]: + """ - # Upsampling : - with tf.name_scope('upsampling'): - selected_upscale_params = [l for i, l in enumerate(params.upscale_params) - if params.selected_levels_upscaling[i]] + :return: The filename of the pretrained checkpoint and the corresponding variables (List of Dict mapping) \ + or `None` if no-pretraining is done + """ + return None, None - assert len(params.selected_levels_upscaling) == len(intermediate_levels), \ - 'Upscaling : {} is different from {}'.format(len(params.selected_levels_upscaling), - len(intermediate_levels)) - selected_intermediate_levels = [l for i, l in enumerate(intermediate_levels) - if params.selected_levels_upscaling[i]] +class Decoder(ABC): + @abstractmethod + def __call__(self, feature_maps: List[tf.Tensor], num_classes: int, is_training=False) -> tf.Tensor: + """ - # Upsampling loop - n_layer = 1 - for i in reversed(range(len(selected_intermediate_levels))): - out_tensor = upsample_conv(out_tensor, selected_intermediate_levels[i], - selected_upscale_params[i], n_layer) - n_layer += 1 + :param feature_maps: list of feature maps, in decreasing spatial resolution, first one being at the original \ + resolution + :return: [N,H,W,num_classes] float32 tensor of logit scores + """ + pass + + +class SimpleDecoder(Decoder): + """ + + :ivar upsampling_dims: + :ivar max_depth: + :ivar weight_decay: + :ivar self.batch_norm_fn: + """ + def __init__(self, upsampling_dims: List[int], max_depth: int = None, weight_decay: float=0.): + self.upsampling_dims = upsampling_dims + self.max_depth = max_depth + self.weight_decay = weight_decay + renorm = True + self.batch_norm_params = { + "renorm": renorm, + "renorm_clipping": {'rmax': 100, 'rmin': 0.1, 'dmax': 10}, + "renorm_momentum": 0.98 + } + + def __call__(self, feature_maps: List[tf.Tensor], num_classes: int, is_training=False): + + batch_norm_fn = lambda x: tf.layers.batch_normalization(x, axis=-1, training=is_training, + name='batch_norm', **self.batch_norm_params) + + # Upsampling + with tf.variable_scope('SimpleDecoder'): + with arg_scope([layers.conv2d], + normalizer_fn=batch_norm_fn, + weights_regularizer=layers.l2_regularizer(self.weight_decay)): + + assert len(self.upsampling_dims) + 1 == len(feature_maps), \ + 'Upscaling : length of {} does not match {}'.format(len(self.upsampling_dims), + len(feature_maps)) + + # Force layers to not be too big to reduce memory usage + for i, l in enumerate(feature_maps): + if self.max_depth and l.get_shape()[-1] > self.max_depth: + feature_maps[i] = layers.conv2d( + inputs=l, + num_outputs=self.max_depth, + kernel_size=[1, 1], + scope="dimreduc_{}".format(i), + normalizer_fn=batch_norm_fn, + activation_fn=None + ) + + # Deconvolving loop + out_tensor = feature_maps[-1] + for i, f_map in reversed(list(enumerate(feature_maps[:-1]))): + out_tensor = _upsample_concat(out_tensor, f_map, scope_name='upsample_{}'.format(i)) + out_tensor = layers.conv2d(inputs=out_tensor, + num_outputs=self.upsampling_dims[i], + kernel_size=[3, 3], + scope="conv_{}".format(i)) logits = 
layers.conv2d(inputs=out_tensor, num_outputs=num_classes, activation_fn=None, kernel_size=[1, 1], - scope="conv{}-logits".format(n_layer)) - - return logits # [B,h,w,Classes] - - -def inference_resnet_v1_50(images, params, num_classes, use_batch_norm=False, weight_decay=0.0, - is_training=False) -> tf.Tensor: - if use_batch_norm: - if params.batch_renorm: - renorm_clipping = {'rmax': 100, 'rmin': 0.1, 'dmax': 1} - renorm_momentum = 0.98 - else: - renorm_clipping = None - renorm_momentum = 0.99 - batch_norm_fn = lambda x: tf.layers.batch_normalization(x, axis=-1, training=is_training, name='batch_norm', - renorm=params.batch_renorm, - renorm_clipping=renorm_clipping, - renorm_momentum=renorm_momentum) - else: - batch_norm_fn = None - - def upsample_conv(input_tensor, previous_intermediate_layer, layer_params, number) -> tf.Tensor: - """ - Deconvolution (upscaling) layers - - :param input_tensor: - :param previous_intermediate_layer: - :param layer_params: - :param number: - :return: - """ - with tf.variable_scope('deconv_{}'.format(number)): - if previous_intermediate_layer.get_shape()[1].value and \ - previous_intermediate_layer.get_shape()[2].value: - target_shape = previous_intermediate_layer.get_shape()[1:3] - else: - target_shape = tf.shape(previous_intermediate_layer)[1:3] - upsampled_layer = tf.image.resize_images(input_tensor, target_shape, - method=tf.image.ResizeMethod.BILINEAR) - net = tf.concat([upsampled_layer, previous_intermediate_layer], 3) - - filter_size, nb_bottlenecks = layer_params - if nb_bottlenecks > 0: - for i in range(nb_bottlenecks): - net = resnet_v1.bottleneck( - inputs=net, - depth=filter_size, - depth_bottleneck=filter_size // 4, - stride=1 - ) - else: - net = layers.conv2d( - inputs=net, - num_outputs=filter_size, - kernel_size=[3, 3], - scope="conv{}".format(number) - ) - - return net - - # Original ResNet - blocks_needed = max([i for i, is_needed in enumerate(params.selected_levels_upscaling) if is_needed]) - resnet_net, intermediate_layers = resnet_v1_50_fn(images, is_training=False, blocks=blocks_needed, - weight_decay=weight_decay, renorm=False, - corrected_version=params.correct_resnet_version) - - # Upsampling - with tf.variable_scope('upsampling'): - with arg_scope([layers.conv2d], - normalizer_fn=batch_norm_fn, - weights_regularizer=layers.l2_regularizer(weight_decay)): - selected_upscale_params = [l for i, l in enumerate(params.upscale_params) - if params.selected_levels_upscaling[i]] - - assert len(selected_upscale_params) == len(intermediate_layers), \ - 'Upscaling : {} is different from {}'.format(len(selected_upscale_params), - len(intermediate_layers)) - - selected_intermediate_levels = [l for i, l in enumerate(intermediate_layers) - if params.selected_levels_upscaling[i]] - - # Rescaled image values to [0,1] - selected_intermediate_levels.insert(0, images/255.0) - - # Force layers to not be too big to reduce memory usage - for i, l in enumerate(selected_intermediate_levels): - if l.get_shape()[-1] > params.max_depth: - selected_intermediate_levels[i] = layers.conv2d( - inputs=l, - num_outputs=params.max_depth, - kernel_size=[1, 1], - scope="dimreduc_{}".format(i), - # normalizer_fn=batch_norm_fn, - activation_fn=None - ) + scope="conv-logits") - # Deconvolving loop - out_tensor = selected_intermediate_levels[-1] - n_layer = 1 - for i in reversed(range(len(selected_intermediate_levels) - 1)): - out_tensor = upsample_conv(out_tensor, selected_intermediate_levels[i], - selected_upscale_params[i], n_layer) + return logits - n_layer += 1 - if 
images.get_shape()[1].value and images.get_shape()[2].value: - target_shape = images.get_shape()[1:3] - else: - target_shape = tf.shape(images)[1:3] - out_tensor = tf.image.resize_images(out_tensor, target_shape, - method=tf.image.ResizeMethod.BILINEAR) +def _get_image_shape_tensor(tensor: tf.Tensor) -> Union[Tuple[int, int], tf.Tensor]: + """ + Get the image shape of the tensor - logits = layers.conv2d(inputs=out_tensor, - num_outputs=num_classes, - activation_fn=None, - kernel_size=[1, 1], - scope="conv{}-logits".format(n_layer)) - - return logits - - -def conv_bn_layer(input_tensor, kernel_size, output_channels, stride=1, bn=False, - is_training=True, relu=True): - # with tf.variable_scope(name) as scope: - conv_layer = layers.conv2d(inputs=input_tensor, - num_outputs=output_channels, - kernel_size=kernel_size, - stride=stride, - activation_fn=tf.identity, - padding='SAME') - if bn and relu: - # How to use Batch Norm: https://github.com/martin-gorner/tensorflow-mnist-tutorial/blob/master/README_BATCHNORM.md - - # Why scale is false when using ReLU as the next activation - # https://datascience.stackexchange.com/questions/22073/why-is-scale-parameter-on-batch-normalization-not-needed-on-relu/22127 - - # Using fuse operation: https://www.tensorflow.org/performance/performance_guide#common_fused_ops - conv_layer = layers.batch_norm(inputs=conv_layer, center=True, scale=False, is_training=is_training, fused=True) - conv_layer = tf.nn.relu(conv_layer) - - if bn and not relu: - conv_layer = layers.batch_norm(inputs=conv_layer, center=True, scale=True, is_training=is_training) - - # print('Conv layer {0} -> {1}'.format(input_tensor.get_shape().as_list(),conv_layer.get_shape().as_list())) - return conv_layer - - -def _get_image_shape_tensor(tensor: tf.Tensor): + :param tensor: Input image tensor [N,H,W,...] 
+ :return: a (int, int) tuple if shape is defined, otherwise the corresponding tf.Tensor value + """ if tensor.get_shape()[1].value and \ - tensor.get_shape()[2].value: + tensor.get_shape()[2].value: target_shape = tensor.get_shape()[1:3] else: target_shape = tf.shape(tensor)[1:3] return target_shape -def inference_u_net(images: tf.Tensor, params: ModelParams, num_classes: int, use_batch_norm=False, weight_decay=0.0, - is_training=False) -> tf.Tensor: - enc_layers = OrderedDict() - dec_layers = OrderedDict() - - with tf.variable_scope('U-Net'): - - with tf.variable_scope('Encoder'): - - conv_layer = layers.conv2d(images, num_outputs=64, kernel_size=(3, 3), padding='SAME', - activation_fn=tf.identity) - - enc_layers['conv_layer_enc_64'] = conv_bn_layer(conv_layer, kernel_size=(3, 3), - output_channels=64, - bn=True, is_training=is_training, relu=True) - - conv_layer = layers.max_pool2d(inputs=enc_layers['conv_layer_enc_64'], kernel_size=(2, 2), stride=2) - - for n_feat in [128, 256, 512]: - enc_layers['conv_layer_enc_' + str(n_feat)] = conv_bn_layer(conv_layer, kernel_size=(3, 3), - output_channels=n_feat, - bn=True, - is_training=is_training, relu=True) - - enc_layers['conv_layer_enc_' + str(n_feat)] = conv_bn_layer( - enc_layers['conv_layer_enc_' + str(n_feat)], kernel_size=(3, 3), - output_channels=n_feat, - bn=True, is_training=is_training, relu=True) - - conv_layer = layers.max_pool2d(inputs=enc_layers['conv_layer_enc_' + str(n_feat)], kernel_size=(2, 2), stride=2) - - conv_layer_enc_1024 = conv_bn_layer(conv_layer, kernel_size=(3, 3), - output_channels=1024, - bn=True, is_training=is_training, relu=True) - - with tf.variable_scope('Decoder'): - dec_layers['conv_layer_dec_512'] = conv_bn_layer(conv_layer_enc_1024, kernel_size=(3, 3), - output_channels=512, - bn=True, is_training=is_training, relu=True) - - reduced_patchsize = _get_image_shape_tensor(enc_layers['conv_layer_enc_512']) - dec_layers['conv_layer_dec_512'] = tf.image.resize_images(dec_layers['conv_layer_dec_512'], size=reduced_patchsize, - method=tf.image.ResizeMethod.BILINEAR) - - for n_feat in [512, 256, 128, 64]: - - dec_layers['conv_layer_dec_' + str(n_feat * 2)] = tf.concat([dec_layers['conv_layer_dec_' + str(n_feat)], - enc_layers['conv_layer_enc_' + str(n_feat)]], - axis=3) - dec_layers['conv_layer_dec_' + str(n_feat)] = conv_bn_layer( - dec_layers['conv_layer_dec_' + str(n_feat * 2)], kernel_size=(3, 3), - output_channels=n_feat, - bn=True, is_training=is_training, relu=True) - if n_feat > 64: - dec_layers['conv_layer_dec_' + str(int(n_feat / 2))] = conv_bn_layer( - dec_layers['conv_layer_dec_' + str(n_feat)], kernel_size=(3, 3), - output_channels=n_feat / 2, - bn=True, is_training=is_training, relu=True) - - reduced_patchsize = _get_image_shape_tensor(enc_layers['conv_layer_enc_' + str(int(n_feat / 2))]) - dec_layers['conv_layer_dec_' + str(int(n_feat / 2))] = tf.image.resize_images( - dec_layers['conv_layer_dec_' + str(int(n_feat / 2))], - size=reduced_patchsize, - method=tf.image.ResizeMethod.BILINEAR) - - return layers.conv2d(dec_layers['conv_layer_dec_64'], num_outputs=num_classes, kernel_size=(3, 3), - padding='SAME', activation_fn=tf.identity) +def _upsample_concat(pooled_layer: tf.Tensor, previous_layer: tf.Tensor, scope_name: str='UpsampleConcat'): + """ + + :param pooled_layer: [N,H,W,C] coarse layer + :param previous_layer: [N,H',W',C'] fine layer (H'>H, and W'>W) + :param scope_name: + :return: [N,H',W',C+C'] concatenation of upsampled-`pooled_layer` and `previous_layer` + """ + with 
tf.name_scope(scope_name): + # Upsamples the coarse level + target_shape = _get_image_shape_tensor(previous_layer) + upsampled_layer = tf.image.resize_images(pooled_layer, target_shape, + method=tf.image.ResizeMethod.BILINEAR) + # Concatenate the upsampled-coarse and the other feature_map + input_tensor = tf.concat([upsampled_layer, previous_layer], 3) + return input_tensor diff --git a/dh_segment/network/pretrained_models.py b/dh_segment/network/pretrained_models.py deleted file mode 100644 index 9f69af6..0000000 --- a/dh_segment/network/pretrained_models.py +++ /dev/null @@ -1,123 +0,0 @@ -from tensorflow.contrib import slim, layers -import tensorflow as tf -from tensorflow.contrib.slim import nets -import numpy as np - -_VGG_MEANS = [123.68, 116.78, 103.94] - - -def mean_substraction(input_tensor, means=_VGG_MEANS): - return tf.subtract(input_tensor, np.array(means)[None, None, None, :], name='MeanSubstraction') - - -def vgg_16_fn(input_tensor: tf.Tensor, scope='vgg_16', blocks=5, weight_decay=0.0005) \ - -> (tf.Tensor, list): # list of tf.Tensors (layers) - intermediate_levels = [] - # intermediate_levels.append(input_tensor) - with slim.arg_scope(nets.vgg.vgg_arg_scope(weight_decay=weight_decay)): - with tf.variable_scope(scope, 'vgg_16', [input_tensor]) as sc: - input_tensor = mean_substraction(input_tensor) - intermediate_levels.append(input_tensor) - end_points_collection = sc.original_name_scope + '_end_points' - # Collect outputs for conv2d, fully_connected and max_pool2d. - with slim.arg_scope( - [layers.conv2d, layers.fully_connected, layers.max_pool2d], - outputs_collections=end_points_collection): - net = layers.repeat( - input_tensor, 2, layers.conv2d, 64, [3, 3], scope='conv1') - intermediate_levels.append(net) - net = layers.max_pool2d(net, [2, 2], scope='pool1') - if blocks >= 2: - net = layers.repeat(net, 2, layers.conv2d, 128, [3, 3], scope='conv2') - intermediate_levels.append(net) - net = layers.max_pool2d(net, [2, 2], scope='pool2') - if blocks >= 3: - net = layers.repeat(net, 3, layers.conv2d, 256, [3, 3], scope='conv3') - intermediate_levels.append(net) - net = layers.max_pool2d(net, [2, 2], scope='pool3') - if blocks >= 4: - net = layers.repeat(net, 3, layers.conv2d, 512, [3, 3], scope='conv4') - intermediate_levels.append(net) - net = layers.max_pool2d(net, [2, 2], scope='pool4') - if blocks >= 5: - net = layers.repeat(net, 3, layers.conv2d, 512, [3, 3], scope='conv5') - intermediate_levels.append(net) - net = layers.max_pool2d(net, [2, 2], scope='pool5') - - return net, intermediate_levels - - -def resnet_v1_50_fn(input_tensor: tf.Tensor, is_training=False, blocks=4, weight_decay=0.0001, - renorm=True, corrected_version=False) -> tf.Tensor: - with slim.arg_scope(nets.resnet_v1.resnet_arg_scope(weight_decay=weight_decay, batch_norm_decay=0.999)), \ - slim.arg_scope([layers.batch_norm], renorm_decay=0.95, renorm=renorm): - input_tensor = mean_substraction(input_tensor) - assert 0 < blocks <= 4 - - if corrected_version: - def corrected_resnet_v1_block(scope, base_depth, num_units, stride): - """Helper function for creating a resnet_v1 bottleneck block. - - Args: - scope: The scope of the block. - base_depth: The depth of the bottleneck layer for each unit. - num_units: The number of units in the block. - stride: The stride of the block, implemented as a stride in the last unit. - All other units have stride=1. - - Returns: - A resnet_v1 bottleneck block. 
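To make the new `Encoder`/`Decoder` contract concrete, here is a minimal sketch of a custom encoder paired with `SimpleDecoder`. `TinyEncoder` is a made-up toy, not part of the patch; the only constraints taken from the code above are that the encoder returns feature maps in decreasing spatial resolution (first one at the input resolution) and that `len(upsampling_dims) + 1 == len(feature_maps)`.

```python
import tensorflow as tf
from tensorflow.contrib import layers

# Assumes dh_segment.network re-exports these, as in the updated __init__.py above
from dh_segment.network import Encoder, SimpleDecoder


class TinyEncoder(Encoder):
    """Toy encoder: the (rescaled) input plus two progressively pooled feature maps."""
    def __call__(self, images: tf.Tensor, is_training=False):
        net = images / 255.0
        feature_maps = [net]
        for i, depth in enumerate([32, 64]):
            net = layers.conv2d(net, depth, [3, 3], scope='conv{}'.format(i))
            net = layers.max_pool2d(net, [2, 2])
            feature_maps.append(net)
        return feature_maps


images = tf.placeholder(tf.float32, [None, None, None, 3])

encoder = TinyEncoder()
# One entry per upsampling step; index 0 corresponds to the finest-resolution merge
decoder = SimpleDecoder(upsampling_dims=[32, 64])

feature_maps = encoder(images, is_training=True)        # 3 maps
logits = decoder(feature_maps, num_classes=2, is_training=True)  # [N, H, W, 2]
```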
- """ - return nets.resnet_utils.Block(scope, nets.resnet_v1.bottleneck,[{ - 'depth': base_depth * 4, - 'depth_bottleneck': base_depth, - 'stride': stride - }] + [{ - 'depth': base_depth * 4, - 'depth_bottleneck': base_depth, - 'stride': 1 - }] * (num_units - 1)) - - blocks_list = [ - corrected_resnet_v1_block('block1', base_depth=64, num_units=3, stride=1), - corrected_resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), - corrected_resnet_v1_block('block3', base_depth=256, num_units=6, stride=2), - corrected_resnet_v1_block('block4', base_depth=512, num_units=3, stride=2), - ] - desired_endpoints = [ - 'resnet_v1_50/conv1', - 'resnet_v1_50/block1/unit_3/bottleneck_v1', - 'resnet_v1_50/block2/unit_4/bottleneck_v1', - 'resnet_v1_50/block3/unit_6/bottleneck_v1', - 'resnet_v1_50/block4/unit_3/bottleneck_v1' - ] - else: - blocks_list = [ - nets.resnet_v1.resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), - nets.resnet_v1.resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), - nets.resnet_v1.resnet_v1_block('block3', base_depth=256, num_units=6, stride=2), - nets.resnet_v1.resnet_v1_block('block4', base_depth=512, num_units=3, stride=1), - ] - desired_endpoints = [ - 'resnet_v1_50/conv1', - 'resnet_v1_50/block1/unit_2/bottleneck_v1', - 'resnet_v1_50/block2/unit_3/bottleneck_v1', - 'resnet_v1_50/block3/unit_5/bottleneck_v1', - 'resnet_v1_50/block4/unit_3/bottleneck_v1' - ] - - net, endpoints = nets.resnet_v1.resnet_v1(input_tensor, - blocks=blocks_list[:blocks], - num_classes=None, - is_training=is_training, - global_pool=False, - output_stride=None, - include_root_block=True, - reuse=None, - scope='resnet_v1_50') - - intermediate_layers = list() - for d in desired_endpoints[:blocks + 1]: - intermediate_layers.append(endpoints[d]) - - return net, intermediate_layers diff --git a/dh_segment/network/pretrained_models/__init__.py b/dh_segment/network/pretrained_models/__init__.py new file mode 100644 index 0000000..c95406f --- /dev/null +++ b/dh_segment/network/pretrained_models/__init__.py @@ -0,0 +1,3 @@ +from .resnet50 import ResnetV1_50 +from .vgg16 import VGG16 +from .mobilenet.encoder import MobileNetV2 \ No newline at end of file diff --git a/dh_segment/network/pretrained_models/mobilenet/__init__.py b/dh_segment/network/pretrained_models/mobilenet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dh_segment/network/pretrained_models/mobilenet/conv_blocks.py b/dh_segment/network/pretrained_models/mobilenet/conv_blocks.py new file mode 100644 index 0000000..498ce77 --- /dev/null +++ b/dh_segment/network/pretrained_models/mobilenet/conv_blocks.py @@ -0,0 +1,358 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Convolution blocks for mobilenet.""" +import contextlib +import functools + +import tensorflow as tf + +slim = tf.contrib.slim + + +def _fixed_padding(inputs, kernel_size, rate=1): + """Pads the input along the spatial dimensions independently of input size. + + Pads the input such that if it was used in a convolution with 'VALID' padding, + the output would have the same dimensions as if the unpadded input was used + in a convolution with 'SAME' padding. + + Args: + inputs: A tensor of size [batch, height_in, width_in, channels]. + kernel_size: The kernel to be used in the conv2d or max_pool2d operation. + rate: An integer, rate for atrous convolution. + + Returns: + output: A tensor of size [batch, height_out, width_out, channels] with the + input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). + """ + kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), + kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] + pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] + pad_beg = [pad_total[0] // 2, pad_total[1] // 2] + pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] + padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], + [pad_beg[1], pad_end[1]], [0, 0]]) + return padded_inputs + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _split_divisible(num, num_ways, divisible_by=8): + """Evenly splits num, num_ways so each piece is a multiple of divisible_by.""" + assert num % divisible_by == 0 + assert num / num_ways >= divisible_by + # Note: want to round down, we adjust each split to match the total. + base = num // num_ways // divisible_by * divisible_by + result = [] + accumulated = 0 + for i in range(num_ways): + r = base + while accumulated + r < num * (i + 1) / num_ways: + r += divisible_by + result.append(r) + accumulated += r + assert accumulated == num + return result + + +@contextlib.contextmanager +def _v1_compatible_scope_naming(scope): + if scope is None: # Create uniqified separable blocks. + with tf.variable_scope(None, default_name='separable') as s, \ + tf.name_scope(s.original_name_scope): + yield '' + else: + # We use scope_depthwise, scope_pointwise for compatibility with V1 ckpts. + # which provide numbered scopes. + scope += '_' + yield scope + + +@slim.add_arg_scope +def split_separable_conv2d(input_tensor, + num_outputs, + scope=None, + normalizer_fn=None, + stride=1, + rate=1, + endpoints=None, + use_explicit_padding=False): + """Separable mobilenet V1 style convolution. + + Depthwise convolution, with default non-linearity, + followed by 1x1 depthwise convolution. This is similar to + slim.separable_conv2d, but differs in tha it applies batch + normalization and non-linearity to depthwise. This matches + the basic building of Mobilenet Paper + (https://arxiv.org/abs/1704.04861) + + Args: + input_tensor: input + num_outputs: number of outputs + scope: optional name of the scope. Note if provided it will use + scope_depthwise for deptwhise, and scope_pointwise for pointwise. 
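The channel-rounding rule in `_make_divisible` is easiest to see on concrete numbers. A quick check, assuming it is run in the module where `_make_divisible` is defined (the multipliers 1.4 and 0.35 match the wrappers defined further down):

```python
# Channel counts are rounded to a hardware-friendly multiple of `divisor`,
# but never allowed to drop more than 10% below the requested value.
assert _make_divisible(32 * 1.4, 8) == 48    # 44.8 -> 48
assert _make_divisible(32 * 0.35, 8) == 16   # 11.2 -> 8, then bumped to 16 because 8 < 0.9 * 11.2
```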
+ normalizer_fn: which normalizer function to use for depthwise/pointwise + stride: stride + rate: output rate (also known as dilation rate) + endpoints: optional, if provided, will export additional tensors to it. + use_explicit_padding: Use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + + Returns: + output tesnor + """ + + with _v1_compatible_scope_naming(scope) as scope: + dw_scope = scope + 'depthwise' + endpoints = endpoints if endpoints is not None else {} + kernel_size = [3, 3] + padding = 'SAME' + if use_explicit_padding: + padding = 'VALID' + input_tensor = _fixed_padding(input_tensor, kernel_size, rate) + net = slim.separable_conv2d( + input_tensor, + None, + kernel_size, + depth_multiplier=1, + stride=stride, + rate=rate, + normalizer_fn=normalizer_fn, + padding=padding, + scope=dw_scope) + + endpoints[dw_scope] = net + + pw_scope = scope + 'pointwise' + net = slim.conv2d( + net, + num_outputs, [1, 1], + stride=1, + normalizer_fn=normalizer_fn, + scope=pw_scope) + endpoints[pw_scope] = net + return net + + +def expand_input_by_factor(n, divisible_by=8): + return lambda num_inputs, **_: _make_divisible(num_inputs * n, divisible_by) + + +@slim.add_arg_scope +def expanded_conv(input_tensor, + num_outputs, + expansion_size=expand_input_by_factor(6), + stride=1, + rate=1, + kernel_size=(3, 3), + residual=True, + normalizer_fn=None, + project_activation_fn=tf.identity, + split_projection=1, + split_expansion=1, + expansion_transform=None, + depthwise_location='expansion', + depthwise_channel_multiplier=1, + endpoints=None, + use_explicit_padding=False, + padding='SAME', + scope=None): + """Depthwise Convolution Block with expansion. + + Builds a composite convolution that has the following structure + expansion (1x1) -> depthwise (kernel_size) -> projection (1x1) + + Args: + input_tensor: input + num_outputs: number of outputs in the final layer. + expansion_size: the size of expansion, could be a constant or a callable. + If latter it will be provided 'num_inputs' as an input. For forward + compatibility it should accept arbitrary keyword arguments. + Default will expand the input by factor of 6. + stride: depthwise stride + rate: depthwise rate + kernel_size: depthwise kernel + residual: whether to include residual connection between input + and output. + normalizer_fn: batchnorm or otherwise + project_activation_fn: activation function for the project layer + split_projection: how many ways to split projection operator + (that is conv expansion->bottleneck) + split_expansion: how many ways to split expansion op + (that is conv bottleneck->expansion) ops will keep depth divisible + by this value. + expansion_transform: Optional function that takes expansion + as a single input and returns output. + depthwise_location: where to put depthwise covnvolutions supported + values None, 'input', 'output', 'expansion' + depthwise_channel_multiplier: depthwise channel multiplier: + each input will replicated (with different filters) + that many times. So if input had c channels, + output will have c x depthwise_channel_multpilier. + endpoints: An optional dictionary into which intermediate endpoints are + placed. The keys "expansion_output", "depthwise_output", + "projection_output" and "expansion_transform" are always populated, even + if the corresponding functions are not invoked. 
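As a concrete instance of the expansion step described above: with the default `expand_input_by_factor(6)`, a 24-channel input is widened before the depthwise convolution and then projected back to `num_outputs`. A small sketch, assuming it runs where `expand_input_by_factor` is defined:

```python
expansion = expand_input_by_factor(6)
inner_size = expansion(num_inputs=24)
assert inner_size == 144   # 24 -> 144 (1x1 expand) -> 3x3 depthwise -> 1x1 project to num_outputs
```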
+ use_explicit_padding: Use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + padding: Padding type to use if `use_explicit_padding` is not set. + scope: optional scope. + + Returns: + Tensor of depth num_outputs + + Raises: + TypeError: on inval + """ + with tf.variable_scope(scope, default_name='expanded_conv') as s, \ + tf.name_scope(s.original_name_scope): + prev_depth = input_tensor.get_shape().as_list()[3] + if depthwise_location not in [None, 'input', 'output', 'expansion']: + raise TypeError('%r is unknown value for depthwise_location' % + depthwise_location) + if use_explicit_padding: + if padding != 'SAME': + raise TypeError('`use_explicit_padding` should only be used with ' + '"SAME" padding.') + padding = 'VALID' + depthwise_func = functools.partial( + slim.separable_conv2d, + num_outputs=None, + kernel_size=kernel_size, + depth_multiplier=depthwise_channel_multiplier, + stride=stride, + rate=rate, + normalizer_fn=normalizer_fn, + padding=padding, + scope='depthwise') + # b1 -> b2 * r -> b2 + # i -> (o * r) (bottleneck) -> o + input_tensor = tf.identity(input_tensor, 'input') + net = input_tensor + + if depthwise_location == 'input': + if use_explicit_padding: + net = _fixed_padding(net, kernel_size, rate) + net = depthwise_func(net, activation_fn=None) + + if callable(expansion_size): + inner_size = expansion_size(num_inputs=prev_depth) + else: + inner_size = expansion_size + + if inner_size > net.shape[3]: + net = split_conv( + net, + inner_size, + num_ways=split_expansion, + scope='expand', + stride=1, + normalizer_fn=normalizer_fn) + net = tf.identity(net, 'expansion_output') + if endpoints is not None: + endpoints['expansion_output'] = net + + if depthwise_location == 'expansion': + if use_explicit_padding: + net = _fixed_padding(net, kernel_size, rate) + net = depthwise_func(net) + + net = tf.identity(net, name='depthwise_output') + if endpoints is not None: + endpoints['depthwise_output'] = net + if expansion_transform: + net = expansion_transform(expansion_tensor=net, input_tensor=input_tensor) + # Note in contrast with expansion, we always have + # projection to produce the desired output size. + net = split_conv( + net, + num_outputs, + num_ways=split_projection, + stride=1, + scope='project', + normalizer_fn=normalizer_fn, + activation_fn=project_activation_fn) + if endpoints is not None: + endpoints['projection_output'] = net + if depthwise_location == 'output': + if use_explicit_padding: + net = _fixed_padding(net, kernel_size, rate) + net = depthwise_func(net, activation_fn=None) + + if callable(residual): # custom residual + net = residual(input_tensor=input_tensor, output_tensor=net) + elif (residual and + # stride check enforces that we don't add residuals when spatial + # dimensions are None + stride == 1 and + # Depth matches + net.get_shape().as_list()[3] == + input_tensor.get_shape().as_list()[3]): + net += input_tensor + return tf.identity(net, name='output') + + +def split_conv(input_tensor, + num_outputs, + num_ways, + scope, + divisible_by=8, + **kwargs): + """Creates a split convolution. + + Split convolution splits the input and output into + 'num_blocks' blocks of approximately the same size each, + and only connects $i$-th input to $i$ output. + + Args: + input_tensor: input tensor + num_outputs: number of output filters + num_ways: num blocks to split by. + scope: scope for all the operators. + divisible_by: make sure that every part is divisiable by this. 
+ **kwargs: will be passed directly into conv2d operator + Returns: + tensor + """ + b = input_tensor.get_shape().as_list()[3] + + if num_ways == 1 or min(b // num_ways, + num_outputs // num_ways) < divisible_by: + # Don't do any splitting if we end up with less than 8 filters + # on either side. + return slim.conv2d(input_tensor, num_outputs, [1, 1], scope=scope, **kwargs) + + outs = [] + input_splits = _split_divisible(b, num_ways, divisible_by=divisible_by) + output_splits = _split_divisible( + num_outputs, num_ways, divisible_by=divisible_by) + inputs = tf.split(input_tensor, input_splits, axis=3, name='split_' + scope) + base = scope + for i, (input_tensor, out_size) in enumerate(zip(inputs, output_splits)): + scope = base + '_part_%d' % (i,) + n = slim.conv2d(input_tensor, out_size, [1, 1], scope=scope, **kwargs) + n = tf.identity(n, scope + '_output') + outs.append(n) + return tf.concat(outs, 3, name=scope + '_concat') diff --git a/dh_segment/network/pretrained_models/mobilenet/encoder.py b/dh_segment/network/pretrained_models/mobilenet/encoder.py new file mode 100644 index 0000000..4516b98 --- /dev/null +++ b/dh_segment/network/pretrained_models/mobilenet/encoder.py @@ -0,0 +1,53 @@ +from ...model import Encoder +import tensorflow as tf +from .mobilenet_v2 import training_scope, mobilenet_base +from typing import Tuple, Optional, Union, List, Dict +from tensorflow.contrib import slim +import os +from ....utils.misc import get_data_folder, download_file +import tarfile + + +class MobileNetV2(Encoder): + def __init__(self, train_batchnorm: bool=False, weight_decay: float=0.00004, batch_renorm: bool=True): + self.train_batchnorm = train_batchnorm + self.weight_decay = weight_decay + self.batch_renorm = batch_renorm + pretrained_dir = os.path.join(get_data_folder(), 'mobilenet_v2') + self.pretrained_file = os.path.join(pretrained_dir, 'mobilenet_v2_1.0_224.ckpt') + if not os.path.exists(self.pretrained_file+'.index'): + print("Could not find pre-trained file {}, downloading it!".format(self.pretrained_file)) + tar_filename = os.path.join(get_data_folder(), 'resnet_v1_50.tar.gz') + download_file('https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz', tar_filename) + tar = tarfile.open(tar_filename) + tar.extractall(path=pretrained_dir) + tar.close() + os.remove(tar_filename) + assert os.path.exists(self.pretrained_file+'.index') + print('Pre-trained weights downloaded!') + + def __call__(self, images: tf.Tensor, is_training=False) -> List[tf.Tensor]: + outputs = [] + + with slim.arg_scope(training_scope(weight_decay=self.weight_decay, + is_training=is_training and self.train_batchnorm)): + normalized_images = (images / 127.5) - 1.0 + outputs.append(normalized_images) + + desired_endpoints = [ + 'layer_2', + 'layer_4', + 'layer_7', + 'layer_14', + 'layer_18' + ] + + _, endpoints = mobilenet_base(normalized_images) + for d in desired_endpoints: + outputs.append(endpoints[d]) + + return outputs + + def pretrained_information(self) -> Tuple[Optional[str], Union[None, List, Dict]]: + return self.pretrained_file, [v for v in tf.global_variables() + if 'MobilenetV2' in v.name and 'renorm' not in v.name] \ No newline at end of file diff --git a/dh_segment/network/pretrained_models/mobilenet/mobilenet.py b/dh_segment/network/pretrained_models/mobilenet/mobilenet.py new file mode 100644 index 0000000..8c47dd9 --- /dev/null +++ b/dh_segment/network/pretrained_models/mobilenet/mobilenet.py @@ -0,0 +1,466 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
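Putting the new encoder to use: a minimal sketch combining `MobileNetV2` with `SimpleDecoder`. The `upsampling_dims` values and the number of classes are illustrative; the only constraints taken from the code are the six returned feature maps (normalized input plus `layer_2/4/7/14/18`) and the length check in `SimpleDecoder`.

```python
import tensorflow as tf
from dh_segment.network import SimpleDecoder
from dh_segment.network.pretrained_models import MobileNetV2

images = tf.placeholder(tf.float32, [None, None, None, 3])

encoder = MobileNetV2()                              # fetches the pretrained checkpoint on first use
feature_maps = encoder(images, is_training=False)    # 6 feature maps in decreasing resolution

decoder = SimpleDecoder(upsampling_dims=[32, 64, 128, 256, 512], max_depth=512)
logits = decoder(feature_maps, num_classes=2, is_training=False)

# Checkpoint file and list of variables to restore before fine-tuning
pretrained_ckpt, pretrained_vars = encoder.pretrained_information()
```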
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Mobilenet Base Class.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import collections +import contextlib +import copy +import os + +import tensorflow as tf + +slim = tf.contrib.slim + + +@slim.add_arg_scope +def apply_activation(x, name=None, activation_fn=None): + return activation_fn(x, name=name) if activation_fn else x + + +def _fixed_padding(inputs, kernel_size, rate=1): + """Pads the input along the spatial dimensions independently of input size. + + Pads the input such that if it was used in a convolution with 'VALID' padding, + the output would have the same dimensions as if the unpadded input was used + in a convolution with 'SAME' padding. + + Args: + inputs: A tensor of size [batch, height_in, width_in, channels]. + kernel_size: The kernel to be used in the conv2d or max_pool2d operation. + rate: An integer, rate for atrous convolution. + + Returns: + output: A tensor of size [batch, height_out, width_out, channels] with the + input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). + """ + kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), + kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] + pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] + pad_beg = [pad_total[0] // 2, pad_total[1] // 2] + pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] + padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], + [pad_beg[1], pad_end[1]], [0, 0]]) + return padded_inputs + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +@contextlib.contextmanager +def _set_arg_scope_defaults(defaults): + """Sets arg scope defaults for all items present in defaults. + + Args: + defaults: dictionary/list of pairs, containing a mapping from + function to a dictionary of default args. + + Yields: + context manager where all defaults are set. 
+ """ + if hasattr(defaults, 'items'): + items = list(defaults.items()) + else: + items = defaults + if not items: + yield + else: + func, default_arg = items[0] + with slim.arg_scope(func, **default_arg): + with _set_arg_scope_defaults(items[1:]): + yield + + +@slim.add_arg_scope +def depth_multiplier(output_params, + multiplier, + divisible_by=8, + min_depth=8, + **unused_kwargs): + if 'num_outputs' not in output_params: + return + d = output_params['num_outputs'] + output_params['num_outputs'] = _make_divisible(d * multiplier, divisible_by, + min_depth) + + +_Op = collections.namedtuple('Op', ['op', 'params', 'multiplier_func']) + + +def op(opfunc, **params): + multiplier = params.pop('multiplier_transorm', depth_multiplier) + return _Op(opfunc, params=params, multiplier_func=multiplier) + + +class NoOpScope(object): + """No-op context manager.""" + + def __enter__(self): + return None + + def __exit__(self, exc_type, exc_value, traceback): + return False + + +def safe_arg_scope(funcs, **kwargs): + """Returns `slim.arg_scope` with all None arguments removed. + + Arguments: + funcs: Functions to pass to `arg_scope`. + **kwargs: Arguments to pass to `arg_scope`. + + Returns: + arg_scope or No-op context manager. + + Note: can be useful if None value should be interpreted as "do not overwrite + this parameter value". + """ + filtered_args = {name: value for name, value in kwargs.items() + if value is not None} + if filtered_args: + return slim.arg_scope(funcs, **filtered_args) + else: + return NoOpScope() + + +@slim.add_arg_scope +def mobilenet_base( # pylint: disable=invalid-name + inputs, + conv_defs, + multiplier=1.0, + final_endpoint=None, + output_stride=None, + use_explicit_padding=False, + scope=None, + is_training=False): + """Mobilenet base network. + + Constructs a network from inputs to the given final endpoint. By default + the network is constructed in inference mode. To create network + in training mode use: + + with slim.arg_scope(mobilenet.training_scope()): + logits, endpoints = mobilenet_base(...) + + Args: + inputs: a tensor of shape [batch_size, height, width, channels]. + conv_defs: A list of op(...) layers specifying the net architecture. + multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + final_endpoint: The name of last layer, for early termination for + for V1-based networks: last layer is "layer_14", for V2: "layer_20" + output_stride: An integer that specifies the requested ratio of input to + output spatial resolution. If not None, then we invoke atrous convolution + if necessary to prevent the network from reducing the spatial resolution + of the activation maps. Allowed values are 1 or any even number, excluding + zero. Typical values are 8 (accurate fully convolutional mode), 16 + (fast fully convolutional mode), and 32 (classification mode). + + NOTE- output_stride relies on all consequent operators to support dilated + operators via "rate" parameter. This might require wrapping non-conv + operators to operate properly. + + use_explicit_padding: Use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + scope: optional variable scope. + is_training: How to setup batch_norm and other ops. Note: most of the time + this does not need be set directly. 
Use mobilenet.training_scope() to set + up training instead. This parameter is here for backward compatibility + only. It is safe to set it to the value matching + training_scope(is_training=...). It is also safe to explicitly set + it to False, even if there is outer training_scope set to to training. + (The network will be built in inference mode). If this is set to None, + no arg_scope is added for slim.batch_norm's is_training parameter. + + Returns: + tensor_out: output tensor. + end_points: a set of activations for external use, for example summaries or + losses. + + Raises: + ValueError: depth_multiplier <= 0, or the target output_stride is not + allowed. + """ + if multiplier <= 0: + raise ValueError('multiplier is not greater than zero.') + + # Set conv defs defaults and overrides. + conv_defs_defaults = conv_defs.get('defaults', {}) + conv_defs_overrides = conv_defs.get('overrides', {}) + if use_explicit_padding: + conv_defs_overrides = copy.deepcopy(conv_defs_overrides) + conv_defs_overrides[ + (slim.conv2d, slim.separable_conv2d)] = {'padding': 'VALID'} + + if output_stride is not None: + if output_stride == 0 or (output_stride > 1 and output_stride % 2): + raise ValueError('Output stride must be None, 1 or a multiple of 2.') + + # a) Set the tensorflow scope + # b) set padding to default: note we might consider removing this + # since it is also set by mobilenet_scope + # c) set all defaults + # d) set all extra overrides. + with _scope_all(scope, default_scope='Mobilenet'), \ + safe_arg_scope([slim.batch_norm], is_training=is_training), \ + _set_arg_scope_defaults(conv_defs_defaults), \ + _set_arg_scope_defaults(conv_defs_overrides): + # The current_stride variable keeps track of the output stride of the + # activations, i.e., the running product of convolution strides up to the + # current network layer. This allows us to invoke atrous convolution + # whenever applying the next convolution would result in the activations + # having output stride larger than the target output_stride. + current_stride = 1 + + # The atrous convolution rate parameter. + rate = 1 + + net = inputs + # Insert default parameters before the base scope which includes + # any custom overrides set in mobilenet. + end_points = {} + scopes = {} + for i, opdef in enumerate(conv_defs['spec']): + params = dict(opdef.params) + opdef.multiplier_func(params, multiplier) + stride = params.get('stride', 1) + if output_stride is not None and current_stride == output_stride: + # If we have reached the target output_stride, then we need to employ + # atrous convolution with stride=1 and multiply the atrous rate by the + # current unit's stride for use in subsequent layers. + layer_stride = 1 + layer_rate = rate + rate *= stride + else: + layer_stride = stride + layer_rate = 1 + current_stride *= stride + # Update params. + params['stride'] = layer_stride + # Only insert rate to params if rate > 1. 
+ if layer_rate > 1: + params['rate'] = layer_rate + # Set padding + if use_explicit_padding: + if 'kernel_size' in params: + net = _fixed_padding(net, params['kernel_size'], layer_rate) + else: + params['use_explicit_padding'] = True + + end_point = 'layer_%d' % (i + 1) + try: + net = opdef.op(net, **params) + except Exception: + print('Failed to create op %i: %r params: %r' % (i, opdef, params)) + raise + end_points[end_point] = net + scope = os.path.dirname(net.name) + scopes[scope] = end_point + if final_endpoint is not None and end_point == final_endpoint: + break + + # Add all tensors that end with 'output' to + # endpoints + for t in net.graph.get_operations(): + scope = os.path.dirname(t.name) + bn = os.path.basename(t.name) + if scope in scopes and t.name.endswith('output'): + end_points[scopes[scope] + '/' + bn] = t.outputs[0] + return net, end_points + + +@contextlib.contextmanager +def _scope_all(scope, default_scope=None): + with tf.variable_scope(scope, default_name=default_scope) as s, \ + tf.name_scope(s.original_name_scope): + yield s + + +@slim.add_arg_scope +def mobilenet(inputs, + num_classes=1001, + prediction_fn=slim.softmax, + reuse=None, + scope='Mobilenet', + base_only=False, + **mobilenet_args): + """Mobilenet model for classification, supports both V1 and V2. + + Note: default mode is inference, use mobilenet.training_scope to create + training network. + + + Args: + inputs: a tensor of shape [batch_size, height, width, channels]. + num_classes: number of predicted classes. If 0 or None, the logits layer + is omitted and the input features to the logits layer (before dropout) + are returned instead. + prediction_fn: a function to get predictions out of logits + (default softmax). + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + base_only: if True will only create the base of the network (no pooling + and no logits). + **mobilenet_args: passed to mobilenet_base verbatim. + - conv_defs: list of conv defs + - multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + - output_stride: will ensure that the last layer has at most total stride. + If the architecture calls for more stride than that provided + (e.g. output_stride=16, but the architecture has 5 stride=2 operators), + it will replace output_stride with fractional convolutions using Atrous + Convolutions. + + Returns: + logits: the pre-softmax activations, a tensor of size + [batch_size, num_classes] + end_points: a dictionary from components of the network to the corresponding + activation tensor. + + Raises: + ValueError: Input rank is invalid. 
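The stride and dilation bookkeeping in `mobilenet_base` is worth tracing once by hand. The sketch below replays it in plain Python, with the per-op convolution strides read off `V2_DEF` (defined further down) and a target `output_stride` of 16: the stride-2 op of the 160-channel block ends up running at stride 1, and every subsequent op is dilated with rate 2.

```python
strides = [2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]   # per-op strides in V2_DEF
output_stride = 16

current_stride, rate = 1, 1
for s in strides:
    if current_stride == output_stride:
        layer_stride, layer_rate = 1, rate   # keep resolution, dilate later layers instead
        rate *= s
    else:
        layer_stride, layer_rate = s, 1
        current_stride *= s

assert current_stride == 16 and rate == 2
```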
+ """ + is_training = mobilenet_args.get('is_training', False) + input_shape = inputs.get_shape().as_list() + if len(input_shape) != 4: + raise ValueError('Expected rank 4 input, was: %d' % len(input_shape)) + + with tf.variable_scope(scope, 'Mobilenet', reuse=reuse) as scope: + inputs = tf.identity(inputs, 'input') + net, end_points = mobilenet_base(inputs, scope=scope, **mobilenet_args) + if base_only: + return net, end_points + + net = tf.identity(net, name='embedding') + + with tf.variable_scope('Logits'): + net = global_pool(net) + end_points['global_pool'] = net + if not num_classes: + return net, end_points + net = slim.dropout(net, scope='Dropout', is_training=is_training) + # 1 x 1 x num_classes + # Note: legacy scope name. + logits = slim.conv2d( + net, + num_classes, [1, 1], + activation_fn=None, + normalizer_fn=None, + biases_initializer=tf.zeros_initializer(), + scope='Conv2d_1c_1x1') + + logits = tf.squeeze(logits, [1, 2]) + + logits = tf.identity(logits, name='output') + end_points['Logits'] = logits + if prediction_fn: + end_points['Predictions'] = prediction_fn(logits, 'Predictions') + return logits, end_points + + +def global_pool(input_tensor, pool_op=tf.nn.avg_pool): + """Applies avg pool to produce 1x1 output. + + NOTE: This function is funcitonally equivalenet to reduce_mean, but it has + baked in average pool which has better support across hardware. + + Args: + input_tensor: input tensor + pool_op: pooling op (avg pool is default) + Returns: + a tensor batch_size x 1 x 1 x depth. + """ + shape = input_tensor.get_shape().as_list() + if shape[1] is None or shape[2] is None: + kernel_size = tf.convert_to_tensor( + [1, tf.shape(input_tensor)[1], + tf.shape(input_tensor)[2], 1]) + else: + kernel_size = [1, shape[1], shape[2], 1] + output = pool_op( + input_tensor, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID') + # Recover output shape, for unknown shape. + output.set_shape([None, 1, 1, None]) + return output + + +def training_scope(is_training=True, + weight_decay=0.00004, + stddev=0.09, + dropout_keep_prob=0.8, + bn_decay=0.997): + """Defines Mobilenet training scope. + + Usage: + with tf.contrib.slim.arg_scope(mobilenet.training_scope()): + logits, endpoints = mobilenet_v2.mobilenet(input_tensor) + + # the network created will be trainble with dropout/batch norm + # initialized appropriately. + Args: + is_training: if set to False this will ensure that all customizations are + set to non-training mode. This might be helpful for code that is reused + across both training/evaluation, but most of the time training_scope with + value False is not needed. If this is set to None, the parameters is not + added to the batch_norm arg_scope. + + weight_decay: The weight decay to use for regularizing the model. + stddev: Standard deviation for initialization, if negative uses xavier. + dropout_keep_prob: dropout keep probability (not set if equals to None). + bn_decay: decay for the batch norm moving averages (not set if equals to + None). + + Returns: + An argument scope to use via arg_scope. + """ + # Note: do not introduce parameters that would change the inference + # model here (for example whether to use bias), modify conv_def instead. + batch_norm_params = { + 'decay': bn_decay, + 'is_training': is_training + } + if stddev < 0: + weight_intitializer = slim.initializers.xavier_initializer() + else: + weight_intitializer = tf.truncated_normal_initializer(stddev=stddev) + + # Set weight_decay for weights in Conv and FC layers. 
+ with slim.arg_scope( + [slim.conv2d, slim.fully_connected, slim.separable_conv2d], + weights_initializer=weight_intitializer, + normalizer_fn=slim.batch_norm), \ + slim.arg_scope([mobilenet_base, mobilenet], is_training=is_training), \ + safe_arg_scope([slim.batch_norm], **batch_norm_params), \ + safe_arg_scope([slim.dropout], is_training=is_training, + keep_prob=dropout_keep_prob), \ + slim.arg_scope([slim.conv2d], \ + weights_regularizer=slim.l2_regularizer(weight_decay)), \ + slim.arg_scope([slim.separable_conv2d], weights_regularizer=None) as s: + return s diff --git a/dh_segment/network/pretrained_models/mobilenet/mobilenet_v2.py b/dh_segment/network/pretrained_models/mobilenet/mobilenet_v2.py new file mode 100644 index 0000000..f2df180 --- /dev/null +++ b/dh_segment/network/pretrained_models/mobilenet/mobilenet_v2.py @@ -0,0 +1,219 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of Mobilenet V2. + +Architecture: https://arxiv.org/abs/1801.04381 + +The base model gives 72.2% accuracy on ImageNet, with 300MMadds, +3.4 M parameters. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import functools + +import tensorflow as tf + +from . import conv_blocks as ops +from . import mobilenet as lib + +slim = tf.contrib.slim +op = lib.op + +expand_input = ops.expand_input_by_factor + +# pyformat: disable +# Architecture: https://arxiv.org/abs/1801.04381 +V2_DEF = dict( + defaults={ + # Note: these parameters of batch norm affect the architecture + # that's why they are here and not in training_scope. 
+ (slim.batch_norm,): {'center': True, 'scale': True}, + (slim.conv2d, slim.fully_connected, slim.separable_conv2d): { + 'normalizer_fn': slim.batch_norm, 'activation_fn': tf.nn.relu6 + }, + (ops.expanded_conv,): { + 'expansion_size': expand_input(6), + 'split_expansion': 1, + 'normalizer_fn': slim.batch_norm, + 'residual': True + }, + (slim.conv2d, slim.separable_conv2d): {'padding': 'SAME'} + }, + spec=[ + op(slim.conv2d, stride=2, num_outputs=32, kernel_size=[3, 3]), + op(ops.expanded_conv, + expansion_size=expand_input(1, divisible_by=1), + num_outputs=16), + op(ops.expanded_conv, stride=2, num_outputs=24), + op(ops.expanded_conv, stride=1, num_outputs=24), + op(ops.expanded_conv, stride=2, num_outputs=32), + op(ops.expanded_conv, stride=1, num_outputs=32), + op(ops.expanded_conv, stride=1, num_outputs=32), + op(ops.expanded_conv, stride=2, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=96), + op(ops.expanded_conv, stride=1, num_outputs=96), + op(ops.expanded_conv, stride=1, num_outputs=96), + op(ops.expanded_conv, stride=2, num_outputs=160), + op(ops.expanded_conv, stride=1, num_outputs=160), + op(ops.expanded_conv, stride=1, num_outputs=160), + op(ops.expanded_conv, stride=1, num_outputs=320), + op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280) + ], +) + + +# pyformat: enable + + +@slim.add_arg_scope +def mobilenet(input_tensor, + num_classes=1001, + depth_multiplier=1.0, + scope='MobilenetV2', + conv_defs=None, + finegrain_classification_mode=False, + min_depth=None, + divisible_by=None, + activation_fn=None, + **kwargs): + """Creates mobilenet V2 network. + + Inference mode is created by default. To create training use training_scope + below. + + with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): + logits, endpoints = mobilenet_v2.mobilenet(input_tensor) + + Args: + input_tensor: The input tensor + num_classes: number of classes + depth_multiplier: The multiplier applied to scale number of + channels in each layer. Note: this is called depth multiplier in the + paper but the name is kept for consistency with slim's model builder. + scope: Scope of the operator + conv_defs: Allows to override default conv def. + finegrain_classification_mode: When set to True, the model + will keep the last layer large even for small multipliers. Following + https://arxiv.org/abs/1801.04381 + suggests that it improves performance for ImageNet-type of problems. + *Note* ignored if final_endpoint makes the builder exit earlier. + min_depth: If provided, will ensure that all layers will have that + many channels after application of depth multiplier. + divisible_by: If provided will ensure that all layers # channels + will be divisible by this number. + activation_fn: Activation function to use, defaults to tf.nn.relu6 if not + specified. + **kwargs: passed directly to mobilenet.mobilenet: + prediction_fn- what prediction function to use. + reuse-: whether to reuse variables (if reuse set to true, scope + must be given). 
+ Returns: + logits/endpoints pair + + Raises: + ValueError: On invalid arguments + """ + if conv_defs is None: + conv_defs = V2_DEF + if 'multiplier' in kwargs: + raise ValueError('mobilenetv2 doesn\'t support generic ' + 'multiplier parameter use "depth_multiplier" instead.') + if finegrain_classification_mode: + conv_defs = copy.deepcopy(conv_defs) + if depth_multiplier < 1: + conv_defs['spec'][-1].params['num_outputs'] /= depth_multiplier + if activation_fn: + conv_defs = copy.deepcopy(conv_defs) + defaults = conv_defs['defaults'] + conv_defaults = ( + defaults[(slim.conv2d, slim.fully_connected, slim.separable_conv2d)]) + conv_defaults['activation_fn'] = activation_fn + + depth_args = {} + # NB: do not set depth_args unless they are provided to avoid overriding + # whatever default depth_multiplier might have thanks to arg_scope. + if min_depth is not None: + depth_args['min_depth'] = min_depth + if divisible_by is not None: + depth_args['divisible_by'] = divisible_by + + with slim.arg_scope((lib.depth_multiplier,), **depth_args): + return lib.mobilenet( + input_tensor, + num_classes=num_classes, + conv_defs=conv_defs, + scope=scope, + multiplier=depth_multiplier, + **kwargs) + + +mobilenet.default_image_size = 224 + + +def wrapped_partial(func, *args, **kwargs): + partial_func = functools.partial(func, *args, **kwargs) + functools.update_wrapper(partial_func, func) + return partial_func + + +# Wrappers for mobilenet v2 with depth-multipliers. Be noticed that +# 'finegrain_classification_mode' is set to True, which means the embedding +# layer will not be shrinked when given a depth-multiplier < 1.0. +mobilenet_v2_140 = wrapped_partial(mobilenet, depth_multiplier=1.4) +mobilenet_v2_050 = wrapped_partial(mobilenet, depth_multiplier=0.50, + finegrain_classification_mode=True) +mobilenet_v2_035 = wrapped_partial(mobilenet, depth_multiplier=0.35, + finegrain_classification_mode=True) + + +@slim.add_arg_scope +def mobilenet_base(input_tensor, depth_multiplier=1.0, **kwargs): + """Creates base of the mobilenet (no pooling and no logits) .""" + return mobilenet(input_tensor, + depth_multiplier=depth_multiplier, + base_only=True, **kwargs) + + +def training_scope(**kwargs): + """Defines MobilenetV2 training scope. + + Usage: + with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): + logits, endpoints = mobilenet_v2.mobilenet(input_tensor) + + with slim. + + Args: + **kwargs: Passed to mobilenet.training_scope. The following parameters + are supported: + weight_decay- The weight decay to use for regularizing the model. + stddev- Standard deviation for initialization, if negative uses xavier. + dropout_keep_prob- dropout keep probability + bn_decay- decay for the batch norm moving averages. + + Returns: + An `arg_scope` to use for the mobilenet v2 model. 
+ """ + return lib.training_scope(**kwargs) + + +__all__ = ['training_scope', 'mobilenet_base', 'mobilenet', 'V2_DEF'] diff --git a/dh_segment/network/pretrained_models/resnet50.py b/dh_segment/network/pretrained_models/resnet50.py new file mode 100644 index 0000000..f4de0bb --- /dev/null +++ b/dh_segment/network/pretrained_models/resnet50.py @@ -0,0 +1,120 @@ +from tensorflow.contrib import slim, layers +import tensorflow as tf +from tensorflow.contrib.slim import nets +from ..model import Encoder +import os +import tarfile +from ...utils.misc import get_data_folder, download_file +from .vgg16 import mean_substraction + + +class ResnetV1_50(Encoder): + """ResNet-50 implementation + + :ivar train_batchnorm: Option to use batch norm + :ivar blocks: number of blocks (resnet blocks) + :ivar weight_decay: value of weight decay + :ivar batch_renorm: Option to use batch renorm + :ivar corrected_version: option to use the original resnet implementation (True) but less efficient than \ + `slim`'s implementation + :ivar pretrained_file: path to the file (.ckpt) containing the pretrained weights + """ + def __init__(self, train_batchnorm: bool=False, blocks: int=4, weight_decay: float=0.0001, + batch_renorm: bool=True, corrected_version: bool=False): + self.train_batchnorm = train_batchnorm + self.blocks = blocks + self.weight_decay = weight_decay + self.batch_renorm = batch_renorm + self.corrected_version = corrected_version + self.pretrained_file = os.path.join(get_data_folder(), 'resnet_v1_50.ckpt') + if not os.path.exists(self.pretrained_file): + print("Could not find pre-trained file {}, downloading it!".format(self.pretrained_file)) + tar_filename = os.path.join(get_data_folder(), 'resnet_v1_50.tar.gz') + download_file('http://download.tensorflow.org/models/resnet_v1_50_2016_08_28.tar.gz', tar_filename) + tar = tarfile.open(tar_filename) + tar.extractall(path=get_data_folder()) + tar.close() + os.remove(tar_filename) + assert os.path.exists(self.pretrained_file) + print('Pre-trained weights downloaded!') + + def pretrained_information(self): + return self.pretrained_file, [v for v in tf.global_variables() + if 'resnet_v1_50' in v.name + and 'renorm' not in v.name] + + def __call__(self, images: tf.Tensor, is_training=False): + outputs = [] + + with slim.arg_scope(nets.resnet_v1.resnet_arg_scope(weight_decay=self.weight_decay, batch_norm_decay=0.999)), \ + slim.arg_scope([layers.batch_norm], renorm_decay=0.95, renorm=self.batch_renorm): + mean_substracted_tensor = mean_substraction(images) + assert 0 < self.blocks <= 4 + + if self.corrected_version: + def corrected_resnet_v1_block(scope: str, base_depth: int, num_units: int, stride: int) -> tf.Tensor: + """ + Helper function for creating a resnet_v1 bottleneck block. + + :param scope: The scope of the block. + :param base_depth: The depth of the bottleneck layer for each unit. + :param num_units: The number of units in the block. + :param stride: The stride of the block, implemented as a stride in the last unit. + All other units have stride=1. + :return: A resnet_v1 bottleneck block. 
+ """ + return nets.resnet_utils.Block(scope, nets.resnet_v1.bottleneck, [{ + 'depth': base_depth * 4, + 'depth_bottleneck': base_depth, + 'stride': stride + }] + [{ + 'depth': base_depth * 4, + 'depth_bottleneck': base_depth, + 'stride': 1 + }] * (num_units - 1)) + + blocks_list = [ + corrected_resnet_v1_block('block1', base_depth=64, num_units=3, stride=1), + corrected_resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), + corrected_resnet_v1_block('block3', base_depth=256, num_units=6, stride=2), + corrected_resnet_v1_block('block4', base_depth=512, num_units=3, stride=2), + ] + desired_endpoints = [ + 'resnet_v1_50/conv1', + 'resnet_v1_50/block1/unit_3/bottleneck_v1', + 'resnet_v1_50/block2/unit_4/bottleneck_v1', + 'resnet_v1_50/block3/unit_6/bottleneck_v1', + 'resnet_v1_50/block4/unit_3/bottleneck_v1' + ] + else: + blocks_list = [ + nets.resnet_v1.resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), + nets.resnet_v1.resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), + nets.resnet_v1.resnet_v1_block('block3', base_depth=256, num_units=6, stride=2), + nets.resnet_v1.resnet_v1_block('block4', base_depth=512, num_units=3, stride=1), + ] + desired_endpoints = [ + 'resnet_v1_50/conv1', + 'resnet_v1_50/block1/unit_2/bottleneck_v1', + 'resnet_v1_50/block2/unit_3/bottleneck_v1', + 'resnet_v1_50/block3/unit_5/bottleneck_v1', + 'resnet_v1_50/block4/unit_3/bottleneck_v1' + ] + + net, endpoints = nets.resnet_v1.resnet_v1(mean_substracted_tensor, + blocks=blocks_list[:self.blocks], + num_classes=None, + is_training=self.train_batchnorm and is_training, + global_pool=False, + output_stride=None, + include_root_block=True, + reuse=None, + scope='resnet_v1_50') + + # Add standardized original images + outputs.append(mean_substracted_tensor/127.0) + + for d in desired_endpoints[:self.blocks + 1]: + outputs.append(endpoints[d]) + + return outputs diff --git a/dh_segment/network/pretrained_models/vgg16.py b/dh_segment/network/pretrained_models/vgg16.py new file mode 100644 index 0000000..30d5954 --- /dev/null +++ b/dh_segment/network/pretrained_models/vgg16.py @@ -0,0 +1,77 @@ +from tensorflow.contrib import slim, layers +import tensorflow as tf +from tensorflow.contrib.slim import nets +import numpy as np +from ..model import Encoder +import os +import tarfile +from ...utils.misc import get_data_folder, download_file + +_VGG_MEANS = [123.68, 116.78, 103.94] + + +def mean_substraction(input_tensor, means=_VGG_MEANS): + return tf.subtract(input_tensor, np.array(means)[None, None, None, :], name='MeanSubstraction') + + +class VGG16(Encoder): + """VGG-16 implementation + + :ivar blocks: number of blocks (vgg blocks) + :ivar weight_decay: weight decay value + :ivar pretrained_file: path to the file (.ckpt) containing the pretrained weights + """ + def __init__(self, blocks: int=5, weight_decay: float=0.0005): + self.blocks = blocks + self.weight_decay = weight_decay + self.pretrained_file = os.path.join(get_data_folder(), 'vgg_16.ckpt') + if not os.path.exists(self.pretrained_file): + print("Could not find pre-trained file {}, downloading it!".format(self.pretrained_file)) + tar_filename = os.path.join(get_data_folder(), 'vgg_16.tar.gz') + download_file('http://download.tensorflow.org/models/vgg_16_2016_08_28.tar.gz', tar_filename) + tar = tarfile.open(tar_filename) + tar.extractall(path=get_data_folder()) + tar.close() + os.remove(tar_filename) + assert os.path.exists(self.pretrained_file) + print('Pre-trained weights downloaded!') + + def 
pretrained_information(self): + return self.pretrained_file, [v for v in tf.global_variables() + if 'vgg_16' in v.name + and 'renorm' not in v.name] + + def __call__(self, images: tf.Tensor, is_training=False): + outputs = [] + + with slim.arg_scope(nets.vgg.vgg_arg_scope(weight_decay=self.weight_decay)): + with tf.variable_scope(None, 'vgg_16', [images]) as sc: + input_tensor = mean_substraction(images) + outputs.append(input_tensor) + end_points_collection = sc.original_name_scope + '_end_points' + # Collect outputs for conv2d, fully_connected and max_pool2d. + with slim.arg_scope( + [layers.conv2d, layers.fully_connected, layers.max_pool2d], + outputs_collections=end_points_collection): + net = layers.repeat( + input_tensor, 2, layers.conv2d, 64, [3, 3], scope='conv1') + net = layers.max_pool2d(net, [2, 2], scope='pool1') + outputs.append(net) + if self.blocks >= 2: + net = layers.repeat(net, 2, layers.conv2d, 128, [3, 3], scope='conv2') + net = layers.max_pool2d(net, [2, 2], scope='pool2') + outputs.append(net) + if self.blocks >= 3: + net = layers.repeat(net, 3, layers.conv2d, 256, [3, 3], scope='conv3') + net = layers.max_pool2d(net, [2, 2], scope='pool3') + outputs.append(net) + if self.blocks >= 4: + net = layers.repeat(net, 3, layers.conv2d, 512, [3, 3], scope='conv4') + net = layers.max_pool2d(net, [2, 2], scope='pool4') + outputs.append(net) + if self.blocks >= 5: + net = layers.repeat(net, 3, layers.conv2d, 512, [3, 3], scope='conv5') + net = layers.max_pool2d(net, [2, 2], scope='pool5') + outputs.append(net) + + return outputs diff --git a/dh_segment/post_processing/binarization.py b/dh_segment/post_processing/binarization.py index 6f4df98..0345eb5 100644 --- a/dh_segment/post_processing/binarization.py +++ b/dh_segment/post_processing/binarization.py @@ -38,7 +38,7 @@ def cleaning_binary(mask: np.ndarray, kernel_size: int=5) -> np.ndarray: ksize_close = (kernel_size, kernel_size) mask = cv2.morphologyEx((mask.astype(np.uint8, copy=False) * 255), cv2.MORPH_OPEN, kernel=np.ones(ksize_open)) mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel=np.ones(ksize_close)) - return mask / 255 + return np.uint8(mask / 255) def hysteresis_thresholding(probs: np.array, low_threshold: float, high_threshold: float, diff --git a/dh_segment/post_processing/boxes_detection.py b/dh_segment/post_processing/boxes_detection.py index 04ce858..8a12d08 100644 --- a/dh_segment/post_processing/boxes_detection.py +++ b/dh_segment/post_processing/boxes_detection.py @@ -25,7 +25,7 @@ def find_boxes(boxes_mask: np.ndarray, mode: str= 'min_rectangle', min_area: flo assert len(boxes_mask.shape) == 2, \ 'Input mask must be a 2D array ! 
Mask is now of shape {}'.format(boxes_mask.shape) - _, contours, _ = cv2.findContours(boxes_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours, _ = cv2.findContours(boxes_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) if contours is None: print('No contour found') return None diff --git a/dh_segment/post_processing/polygon_detection.py b/dh_segment/post_processing/polygon_detection.py index 0614612..a970bec 100644 --- a/dh_segment/post_processing/polygon_detection.py +++ b/dh_segment/post_processing/polygon_detection.py @@ -25,6 +25,8 @@ def find_polygonal_regions(image_mask: np.ndarray, min_area: float=0.1, n_max_po found_polygons = list() for c in contours: + if len(c) < 3: # A polygon cannot have less than 3 points + continue polygon = geometry.Polygon([point[0] for point in c]) # Check that polygon has area greater than minimal area if polygon.area >= min_area*np.prod(image_mask.shape[:2]): diff --git a/train.py b/dh_segment/train.py similarity index 76% rename from train.py rename to dh_segment/train.py index 9cf0b44..3211d74 100644 --- a/train.py +++ b/dh_segment/train.py @@ -1,122 +1,127 @@ -import os -import tensorflow as tf -# Tensorflow logging level -from logging import WARNING # import DEBUG, INFO, ERROR for more/less verbosity - -tf.logging.set_verbosity(WARNING) -from dh_segment import estimator_fn, input, utils -import json -from glob import glob -import numpy as np - -try: - import better_exceptions -except ImportError: - print('/!\ W -- Not able to import package better_exceptions') - pass -from tqdm import trange -from sacred import Experiment -import pandas as pd - -ex = Experiment('dhSegment_experiment') - - -@ex.config -def default_config(): - train_data = None # Directory with training data - eval_data = None # Directory with validation data - model_output_dir = None # Directory to output tf model - restore_model = False # Set to true to continue training - classes_file = None # txt file with classes values (unused for REGRESSION) - gpu = '' # GPU to be used for training - prediction_type = utils.PredictionType.CLASSIFICATION # One of CLASSIFICATION, REGRESSION or MULTILABEL - pretrained_model_name = 'resnet50' - model_params = utils.ModelParams(pretrained_model_name=pretrained_model_name).to_dict() # Model parameters - training_params = utils.TrainingParams().to_dict() # Training parameters - if prediction_type == utils.PredictionType.CLASSIFICATION: - assert classes_file is not None - model_params['n_classes'] = utils.get_n_classes_from_file(classes_file) - elif prediction_type == utils.PredictionType.REGRESSION: - model_params['n_classes'] = 1 - elif prediction_type == utils.PredictionType.MULTILABEL: - assert classes_file is not None - model_params['n_classes'] = utils.get_n_classes_from_file_multilabel(classes_file) - - -@ex.automain -def run(train_data, eval_data, model_output_dir, gpu, training_params, _config): - # Create output directory - if not os.path.isdir(model_output_dir): - os.makedirs(model_output_dir) - else: - assert _config.get('restore_model'), \ - '{0} already exists, you cannot use it as output directory. 
' \ - 'Set "restore_model=True" to continue training, or delete dir "rm -r {0}"'.format(model_output_dir) - # Save config - with open(os.path.join(model_output_dir, 'config.json'), 'w') as f: - json.dump(_config, f, indent=4, sort_keys=True) - - # Create export directory for saved models - saved_model_dir = os.path.join(model_output_dir, 'export') - if not os.path.isdir(saved_model_dir): - os.makedirs(saved_model_dir) - - training_params = utils.TrainingParams.from_dict(training_params) - - session_config = tf.ConfigProto() - session_config.gpu_options.visible_device_list = str(gpu) - session_config.gpu_options.per_process_gpu_memory_fraction = 0.9 - estimator_config = tf.estimator.RunConfig().replace(session_config=session_config, - save_summary_steps=10, - keep_checkpoint_max=1) - estimator = tf.estimator.Estimator(estimator_fn.model_fn, model_dir=model_output_dir, - params=_config, config=estimator_config) - - def get_dirs_or_files(input_data): - if os.path.isdir(input_data): - train_input, train_labels_input = os.path.join(input_data, 'images'), os.path.join(input_data, 'labels') - # Check if training dir exists - if not os.path.isdir(train_input): - raise FileNotFoundError(train_input) - if not os.path.isdir(train_labels_input): - raise FileNotFoundError(train_labels_input) - elif os.path.isfile(train_data) and train_data.endswith('.csv'): - train_input = train_data - train_labels_input = None - else: - raise TypeError('input_data {} is neither a directory nor a csv file'.format(input_data)) - return train_input, train_labels_input - - train_input, train_labels_input = get_dirs_or_files(train_data) - if eval_data is not None: - eval_input, eval_labels_input = get_dirs_or_files(eval_data) - - # Configure exporter - serving_input_fn = input.serving_input_filename(training_params.input_resized_size) - exporter = tf.estimator.BestExporter(serving_input_receiver_fn=serving_input_fn, exports_to_keep=2) - - for i in trange(0, training_params.n_epochs, training_params.evaluate_every_epoch, desc='Evaluated epochs'): - estimator.train(input.input_fn(train_input, - input_label_dir=train_labels_input, - num_epochs=training_params.evaluate_every_epoch, - batch_size=training_params.batch_size, - data_augmentation=training_params.data_augmentation, - make_patches=training_params.make_patches, - image_summaries=True, - params=_config, - num_threads=32)) - - if eval_data is not None: - eval_result = estimator.evaluate(input.input_fn(eval_input, - input_label_dir=eval_labels_input, - batch_size=1, - data_augmentation=False, - make_patches=False, - image_summaries=False, - params=_config, - num_threads=32)) - else: - eval_result = None - - exporter.export(estimator, saved_model_dir, checkpoint_path=None, eval_result=eval_result, - is_the_final_export=False) +import os +import tensorflow as tf +# Tensorflow logging level +from logging import WARNING # import DEBUG, INFO, ERROR for more/less verbosity + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # or any {'0', '1', '2'} +tf.logging.set_verbosity(WARNING) +from dh_segment import estimator_fn, utils +from dh_segment.io import input +import json +from tqdm import trange +from sacred import Experiment + +ex = Experiment('dhSegment_experiment') + + +@ex.config +def default_config(): + train_data = None # Directory with training data + eval_data = None # Directory with validation data + model_output_dir = None # Directory to output tf model + restore_model = False # Set to true to continue training + classes_file = None # txt file with classes values (unused for 
REGRESSION) + gpu = '' # GPU to be used for training + prediction_type = utils.PredictionType.CLASSIFICATION # One of CLASSIFICATION, REGRESSION or MULTILABEL + model_params = utils.ModelParams().to_dict() # Model parameters + training_params = utils.TrainingParams().to_dict() # Training parameters + if prediction_type == utils.PredictionType.CLASSIFICATION: + assert classes_file is not None + model_params['n_classes'] = utils.get_n_classes_from_file(classes_file) + elif prediction_type == utils.PredictionType.REGRESSION: + model_params['n_classes'] = 1 + elif prediction_type == utils.PredictionType.MULTILABEL: + assert classes_file is not None + model_params['n_classes'] = utils.get_n_classes_from_file_multilabel(classes_file) + + +@ex.main +def run(train_data, eval_data, model_output_dir, gpu, training_params, _config): + # Create output directory + if not os.path.isdir(model_output_dir): + os.makedirs(model_output_dir) + else: + assert _config.get('restore_model'), \ + '{0} already exists, you cannot use it as output directory. ' \ + 'Set "restore_model=True" to continue training, or delete dir "rm -r {0}"'.format(model_output_dir) + # Save config + with open(os.path.join(model_output_dir, 'config.json'), 'w') as f: + json.dump(_config, f, indent=4, sort_keys=True) + + # Create export directory for saved models + saved_model_dir = os.path.join(model_output_dir, 'export') + if not os.path.isdir(saved_model_dir): + os.makedirs(saved_model_dir) + + training_params = utils.TrainingParams.from_dict(training_params) + + session_config = tf.ConfigProto() + session_config.gpu_options.visible_device_list = str(gpu) + session_config.gpu_options.per_process_gpu_memory_fraction = 0.9 + estimator_config = tf.estimator.RunConfig().replace(session_config=session_config, + save_summary_steps=10, + keep_checkpoint_max=1) + estimator = tf.estimator.Estimator(estimator_fn.model_fn, model_dir=model_output_dir, + params=_config, config=estimator_config) + + def get_dirs_or_files(input_data): + if os.path.isdir(input_data): + image_input, labels_input = os.path.join(input_data, 'images'), os.path.join(input_data, 'labels') + # Check if training dir exists + assert os.path.isdir(image_input), "{} is not a directory".format(image_input) + assert os.path.isdir(labels_input), "{} is not a directory".format(labels_input) + + elif os.path.isfile(input_data) and input_data.endswith('.csv'): + image_input = input_data + labels_input = None + else: + raise TypeError('input_data {} is neither a directory nor a csv file'.format(input_data)) + return image_input, labels_input + + train_input, train_labels_input = get_dirs_or_files(train_data) + if eval_data is not None: + eval_input, eval_labels_input = get_dirs_or_files(eval_data) + + # Configure exporter + serving_input_fn = input.serving_input_filename(training_params.input_resized_size) + exporter = tf.estimator.BestExporter(serving_input_receiver_fn=serving_input_fn, exports_to_keep=2) + + #if eval_data is not None: + # exporter = tf.estimator.BestExporter(serving_input_receiver_fn=serving_input_fn, exports_to_keep=2) + #else: + # exporter = tf.estimator.LatestExporter(name='SimpleExporter', serving_input_receiver_fn=serving_input_fn, + # exports_to_keep=5) + + nb_cores = os.cpu_count() + if nb_cores: + num_threads = min(nb_cores//2, 16) + else: + num_threads = 4 + + for i in trange(0, training_params.n_epochs, training_params.evaluate_every_epoch, desc='Evaluated epochs'): + estimator.train(input.input_fn(train_input, + input_label_dir=train_labels_input, + 
num_epochs=training_params.evaluate_every_epoch, + batch_size=training_params.batch_size, + data_augmentation=training_params.data_augmentation, + make_patches=training_params.make_patches, + image_summaries=True, + params=_config, + num_threads=num_threads, + progressbar_description="Training".format(i))) + + if eval_data is not None: + eval_result = estimator.evaluate(input.input_fn(eval_input, + input_label_dir=eval_labels_input, + batch_size=1, + data_augmentation=False, + make_patches=False, + image_summaries=False, + params=_config, + num_threads=num_threads, + progressbar_description="Evaluation")) + else: + eval_result = None + + exporter.export(estimator, saved_model_dir, checkpoint_path=None, eval_result=eval_result, + is_the_final_export=False) diff --git a/dh_segment/utils/__init__.py b/dh_segment/utils/__init__.py index 07f9d98..ebafdcf 100644 --- a/dh_segment/utils/__init__.py +++ b/dh_segment/utils/__init__.py @@ -46,9 +46,6 @@ _PARAMSCONFIG = [ 'PredictionType', - 'VGG16ModelParams', - 'ResNetModelParams', - 'UNetModelParams', 'ModelParams', 'TrainingParams' ] diff --git a/dh_segment/utils/labels.py b/dh_segment/utils/labels.py index 2f35ae6..4bb4277 100644 --- a/dh_segment/utils/labels.py +++ b/dh_segment/utils/labels.py @@ -4,6 +4,7 @@ import tensorflow as tf import numpy as np import os +from typing import Tuple def label_image_to_class(label_image: tf.Tensor, classes_file: str) -> tf.Tensor: @@ -29,6 +30,13 @@ def class_to_label_image(class_label: tf.Tensor, classes_file: str) -> tf.Tensor def multilabel_image_to_class(label_image: tf.Tensor, classes_file: str) -> tf.Tensor: + """ + Combines image annotations with classes info of the txt file to create the input label for the training. + + :param label_image: annotated image [H,W,Ch] or [B,H,W,Ch] (Ch = color channels) + :param classes_file: the filename of the txt file containing the class info + :return: [H,W,Cl] or [B,H,W,Cl] (Cl = number of classes) + """ classes_color_values, colors_labels = get_classes_color_from_file_multilabel(classes_file) # Convert label_image [H,W,3] to the classes [H,W,C],int32 according to the classes [C,3] with tf.name_scope('LabelAssign'): @@ -71,7 +79,15 @@ def get_n_classes_from_file(classes_file: str) -> int: return get_classes_color_from_file(classes_file).shape[0] -def get_classes_color_from_file_multilabel(classes_file: str) -> np.ndarray: +def get_classes_color_from_file_multilabel(classes_file: str) -> Tuple[np.ndarray, np.array]: + """ + Get classes and code labels from txt file. + This function deals with the case of elements with multiple labels. 
+ + :param classes_file: file containing the classes (usually named *classes.txt*) + :return: for each class the RGB color (array size [N, 3]); and the label's code (array size [N, C]), + with N the number of combinations and C the number of classes + """ if not os.path.exists(classes_file): raise FileNotFoundError(classes_file) result = np.loadtxt(classes_file).astype(np.float32) diff --git a/dh_segment/utils/misc.py b/dh_segment/utils/misc.py index 5a1b77a..102a21c 100644 --- a/dh_segment/utils/misc.py +++ b/dh_segment/utils/misc.py @@ -5,6 +5,14 @@ import json import pickle from hashlib import sha1 +from typing import Any +import importlib +import os +import urllib.request +import tarfile +import os +from tqdm import tqdm +from random import shuffle def parse_json(filename): @@ -29,3 +37,62 @@ def dump_pickle(filename, obj): def hash_dict(params): return sha1(json.dumps(params, sort_keys=True).encode()).hexdigest() + + +def shuffled(l: list) -> list: + ll = l.copy() + shuffle(ll) + return ll + + +def get_class_from_name(full_class_name: str) -> Any: + """ + Tries to load the class from its naming, will import the corresponding module. + Raises an Error if it does not work. + + :param full_class_name: full name of the class, for instance `foo.bar.Baz` + :return: the loaded class + """ + module_name, class_name = full_class_name.rsplit('.', maxsplit=1) + # load the module, will raise ImportError if module cannot be loaded + m = importlib.import_module(module_name) + # get the class, will raise AttributeError if class cannot be found + c = getattr(m, class_name) + return c + + +def get_data_folder() -> str: + folder = os.path.join(os.path.expanduser('~'), '.dh_segment') + os.makedirs(folder, exist_ok=True) + return folder + + +def download_file(url: str, output_file: str): + """ + + :param url: + :param output_file: + :return: + """ + def progress_hook(t): + last_b = [0] + + def update_to(b=1, bsize=1, tsize=None): + """ + b : int, optional + Number of blocks transferred so far [default: 1]. + bsize : int, optional + Size of each block (in tqdm units) [default: 1]. + tsize : int, optional + Total size (in tqdm units). If [default: None] remains unchanged. 
+ """ + if tsize is not None: + t.total = tsize + t.update((b - last_b[0]) * bsize) + last_b[0] = b + + return update_to + + with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, + desc="Downloading pre-trained weights") as t: + urllib.request.urlretrieve(url, output_file, reporthook=progress_hook(t)) diff --git a/dh_segment/utils/params_config.py b/dh_segment/utils/params_config.py index e926b06..6ba245f 100644 --- a/dh_segment/utils/params_config.py +++ b/dh_segment/utils/params_config.py @@ -2,9 +2,9 @@ __author__ = "solivr" __license__ = "GPL" -import os -import warnings -from random import shuffle +from .misc import get_class_from_name +from ..network.model import Encoder, Decoder +from typing import Type, Optional class PredictionType: @@ -19,7 +19,7 @@ class PredictionType: MULTILABEL = 'MULTILABEL' @classmethod - def parse(cls, prediction_type): + def parse(cls, prediction_type) -> 'PredictionType': if prediction_type == 'CLASSIFICATION': return PredictionType.CLASSIFICATION elif prediction_type == 'REGRESSION': @@ -48,104 +48,41 @@ def check_params(self): pass -class VGG16ModelParams: - PRETRAINED_MODEL_FILE = 'pretrained_models/vgg_16.ckpt' - INTERMEDIATE_CONV = [ - [(256, 3)] - ] - UPSCALE_PARAMS = [ - [(32, 3)], - [(64, 3)], - [(128, 3)], - [(256, 3)], - [(512, 3)], - [(512, 3)] - ] - SELECTED_LAYERS_UPSCALING = [ - True, - True, # Must have same length as vgg_upscale_params - True, - True, - False, - False - ] - CORRECTED_VERSION = None - - -class ResNetModelParams: - PRETRAINED_MODEL_FILE = 'pretrained_models/resnet_v1_50.ckpt' - INTERMEDIATE_CONV = None - UPSCALE_PARAMS = [ - # (Filter size (depth bottleneck's output), number of bottleneck) - (32, 0), - (64, 0), - (128, 0), - (256, 0), - (512, 0) - ] - SELECTED_LAYERS_UPSCALING = [ - # Must have the same length as resnet_upscale_params - True, - True, - True, - True, - True - ] - CORRECT_VERSION = False - - -class UNetModelParams: - PRETRAINED_MODEL_FILE = None - INTERMEDIATE_CONV = None - UPSCALE_PARAMS = None - SELECTED_LAYERS_UPSCALING = None - CORRECT_VERSION = False - - class ModelParams(BaseParams): - """Parameters related to the model - + """ + Parameters related to the model + :param encoder_name: + :param encoder_params: + :param decoder_name: + :param decoder_params: + :param n_classes: """ def __init__(self, **kwargs): - self.batch_norm = kwargs.get('batch_norm', True) # type: bool - self.batch_renorm = kwargs.get('batch_renorm', True) # type: bool - self.weight_decay = kwargs.get('weight_decay', 1e-6) # type: float + self.encoder_network = kwargs.get('encoder_network', 'dh_segment.network.pretrained_models.ResnetV1_50') # type: str + self.encoder_network_params = kwargs.get('encoder_network_params', dict()) # type: dict + self.decoder_network = kwargs.get('decoder_network', 'dh_segment.network.SimpleDecoder') # type: str + self.decoder_network_params = kwargs.get('decoder_network_params', { + 'upsampling_dims': [32, 64, 128, 256, 512] + }) # type: dict + self.full_network = kwargs.get('full_network', None) # type: Optional[str] + self.full_network_params = kwargs.get('full_network_params', dict()) # type: dict self.n_classes = kwargs.get('n_classes', None) # type: int - self.pretrained_model_name = kwargs.get('pretrained_model_name', None) # type: str - self.max_depth = kwargs.get('max_depth', 512) # type: int - - if self.pretrained_model_name == 'vgg16': - model_class = VGG16ModelParams - elif self.pretrained_model_name == 'resnet50': - model_class = ResNetModelParams - elif 
self.pretrained_model_name == 'unet': - model_class = UNetModelParams - else: - raise NotImplementedError - self.pretrained_model_file = kwargs.get('pretrained_model_file', model_class.PRETRAINED_MODEL_FILE) - self.intermediate_conv = kwargs.get('intermediate_conv', model_class.INTERMEDIATE_CONV) - self.upscale_params = kwargs.get('upscale_params', model_class.UPSCALE_PARAMS) - self.selected_levels_upscaling = kwargs.get('selected_levels_upscaling', model_class.SELECTED_LAYERS_UPSCALING) - self.correct_resnet_version = kwargs.get('correct_resnet_version', model_class.CORRECT_VERSION) self.check_params() - def check_params(self): - # Pretrained model name check - # assert self.upscale_params is not None and self.selected_levels_upscaling is not None, \ - # 'Model parameters cannot be None' - if self.upscale_params is not None and self.selected_levels_upscaling is not None: + def get_encoder(self) -> Type[Encoder]: + encoder = get_class_from_name(self.encoder_network) + assert issubclass(encoder, Encoder), "{} is not an Encoder".format(encoder) + return encoder - assert len(self.upscale_params) == len(self.selected_levels_upscaling), \ - 'Upscaling levels and selection levels must have the same lengths (in model_params definition), ' \ - '{} != {}'.format(len(self.upscale_params), - len(self.selected_levels_upscaling)) + def get_decoder(self) -> Type[Decoder]: + decoder = get_class_from_name(self.decoder_network) + assert issubclass(decoder, Decoder), "{} is not a Decoder".format(decoder) + return decoder - # assert os.path.isfile(self.pretrained_model_file), \ - # 'Pretrained weights file {} not found'.format(self.pretrained_model_file) - if not os.path.isfile(self.pretrained_model_file): - warnings.warn('WARNING - Default pretrained weights file in {} was not found. ' - 'Have you changed the default pretrained file ?'.format(self.pretrained_model_file)) + def check_params(self): + self.get_encoder() + self.get_decoder() class TrainingParams(BaseParams): @@ -208,6 +145,7 @@ def __init__(self, **kwargs): self.patch_shape = kwargs.get('patch_shape', (300, 300)) self.input_resized_size = int(kwargs.get('input_resized_size', 72e4)) # (600*1200) self.weights_labels = kwargs.get('weights_labels') + self.weights_evaluation_miou = kwargs.get('weights_evaluation_miou', None) self.training_margin = kwargs.get('training_margin', 16) self.local_entropy_ratio = kwargs.get('local_entropy_ratio', 0.) self.local_entropy_sigma = kwargs.get('local_entropy_sigma', 3) @@ -216,4 +154,4 @@ def __init__(self, **kwargs): def check_params(self) -> None: """Checks if there is no parameter inconsistency """ - assert self.training_margin*2 < min(self.patch_shape) \ No newline at end of file + assert self.training_margin*2 < min(self.patch_shape) diff --git a/dh_segment_train b/dh_segment_train new file mode 100644 index 0000000..6beaefd --- /dev/null +++ b/dh_segment_train @@ -0,0 +1,7 @@ +#!/usr/bin/env python + +from dh_segment.train import ex +import sys + +if __name__ == '__main__': + ex.run_commandline(sys.argv+["--force"]) diff --git a/doc/changelog.rst b/doc/changelog.rst index 0ecfd19..bc7bc83 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -2,4 +2,32 @@ Changelog ========= -TBC \ No newline at end of file +Unreleased +---------- + +0.4.0 - 2019-04-10 +------------------ +Added +^^^^^ + +* Input data can be a .csv file with format ``,``. +* ``dh_segment.io.via`` helper functions to generate/export groundtruth from/to VGG Image Annotation tool. 
+* ``Point.array_to_point`` to export a ``np.array`` into a list of ``Point``. +* PAGEXML Regions can now contain a custom attribute (Transkribus output of region annotation) +* ``Page.to_json()`` method for json formatting. + +Changed +^^^^^^^ + +* ``tensorflow`` v1.13 and ``opencv`` v4.0 are now used. +* mIOU metric for evaluation during training (instead of accuracy). +* TextLines are sorted according to their mean `y` coordinate when exported. + +Fixed +^^^^^ + +* Variable names typos in ``input.py`` and ``train.py``. +* Documentation of the quickstart demo. + +Removed +^^^^^^^ diff --git a/doc/reference/io.rst b/doc/reference/io.rst index 48756f1..f3159a8 100644 --- a/doc/reference/io.rst +++ b/doc/reference/io.rst @@ -76,3 +76,8 @@ Input / Output .. automodule:: dh_segment.io.PAGE :members: :undoc-members: + +.. automodule:: dh_segment.io.via + :members: + :undoc-members: + :exclude-members: main, init_logger \ No newline at end of file diff --git a/doc/start/annotating.rst b/doc/start/annotating.rst new file mode 100644 index 0000000..8ea11ed --- /dev/null +++ b/doc/start/annotating.rst @@ -0,0 +1,52 @@ +Creating groundtruth data +------------------------- + +Using GIMP or Photoshop +^^^^^^^^^^^^^^^^^^^^^^^ +Create directly your masks using your favorite image editor. You just have to draw the regions you want to extract +with a different color for each label. + +Using VGG Image Annotator (VIA) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +`VGG Image Annotator (VIA) `_ is an image annotation tool that can be +used to define regions in an image and create textual descriptions of those regions. You can either use it +`online `_ or +`download the application `_. + +From the exported annotations (in JSON format), you'll have to generate the corresponding image masks. +See the :ref:`ref_via` in the ``via`` module. + +When assigning attributes to your annotated regions, you should favour attributes of type "dropdown", "checkbox" +and "radio" and avoid "text" type in order to ease the parsing of the exported file (avoid typos and formatting errors). + +**Example of how to create individual masks from VIA annotation file** + +.. code:: python + + from dh_segment.io import via + + collection = 'mycollection' + annotation_file = 'via_sample.json' + masks_dir = '/home/project/generated_masks' + images_dir = './my_images' + + # Load all the data in the annotation file + # (the file may be an exported project or an export of the annotations) + via_data = via.load_annotation_data(annotation_file) + + # In the case of an exported project file, you can set ``only_img_annotations=True`` + # to get only the image annotations + via_annotations = via.load_annotation_data(annotation_file, only_img_annotations=True) + + # Collect the annotated regions + working_items = via.collect_working_items(via_annotations, collection, images_dir) + + # Collect the attributes and options + if '_via_attributes' in via_data.keys(): + list_attributes = via.parse_via_attributes(via_data['_via_attributes']) + else: + list_attributes = via.get_via_attributes(via_annotations) + + # Create one mask per option per attribute + via.create_masks(masks_dir, working_items, list_attributes, collection) + diff --git a/doc/start/demo.rst b/doc/start/demo.rst index d8b4d74..0a4c68d 100644 --- a/doc/start/demo.rst +++ b/doc/start/demo.rst @@ -11,8 +11,12 @@ In order to limit memory usage, the images in the dataset we provide have been d **How to** +0. 
If you have not yet done so, clone the repository : :: + + git clone https://github.com/dhlab-epfl/dhSegment.git + 1. Get the annotated dataset `here`_, which already contains the folders ``images`` and ``labels`` -for training, validation and testing set. Unzip it into ``model/pages``. :: +for training, validation and testing set. Unzip it into ``demo/pages``. :: cd demo/ wget https://github.com/dhlab-epfl/dhSegment/releases/download/v0.2/pages.zip diff --git a/doc/start/index.rst b/doc/start/index.rst index 208f598..e48554d 100644 --- a/doc/start/index.rst +++ b/doc/start/index.rst @@ -3,5 +3,6 @@ Quickstart .. toctree:: install + annotating training demo \ No newline at end of file diff --git a/doc/start/install.rst b/doc/start/install.rst index 734a3de..391d627 100644 --- a/doc/start/install.rst +++ b/doc/start/install.rst @@ -1,24 +1,33 @@ Installation ------------ +Using ``pip`` +^^^^^^^^^^^^^ + +1. Clone the repository using ``git clone https://github.com/dhlab-epfl/dhSegment.git`` + +2. Create and activate a virtualenv :: + + virtualenv myvirtualenvs/dh_segment + source myvirtualenvs/dh_segment/bin/activate + +3. Install the dependencies using ``pip`` (this will look for the ``setup.py`` file) :: + + pip install git+https://github.com/dhlab-epfl/dhSegment + Using Anaconda ^^^^^^^^^^^^^^ -- Install Anaconda or Miniconda +1. Install Anaconda or Miniconda (`installation procedure `_) -- Create a virtual environment with all the packages ``conda env create -f environment.yml`` +2. Clone the repository: ``git clone https://github.com/dhlab-epfl/dhSegment.git`` -- Then activate the environment with ``source activate dh_segment`` +3. Create a virtual environment with all the packages: ``conda env create -f environment.yml`` -- It might be possible that the following needs to be added to your ``~/.bashrc`` :: +4. Then activate the environment with ``source activate dh_segment`` - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" - export CUDA_HOME=/usr/local/cuda -- To be able to import the package (i.e ``import dh_segment``) in your code, you have to run : :: +5. To be able to import the package (i.e ``import dh_segment``) in your code, you have to run : :: python setup.py install - -Using ``pip`` -^^^^^^^^^^^^^ \ No newline at end of file diff --git a/doc/start/training.rst b/doc/start/training.rst index 1e21039..f033aec 100644 --- a/doc/start/training.rst +++ b/doc/start/training.rst @@ -4,6 +4,8 @@ Training .. note:: A good nvidia GPU (6GB RAM at least) is most likely necessary to train your own models. We assume CUDA and cuDNN are installed. +**Input data** + You need to have your training data in a folder containing ``images`` folder and ``labels`` folder. The pairs (images, labels) need to have the same name (it is not mandatory to have the same extension file, however we recommend having the label images as ``.png`` files). @@ -14,19 +16,62 @@ a specific color. .. note:: It is now also possible to use a `csv` file containing the pairs ``original_image_filename``, ``label_image_filename`` as input data. 
+To input a ``csv`` file instead of the two folders ``images`` and ``labels``, +the content should be formatted in the following way: :: + + mypath/myfolder/original_image_filename1,mypath/myfolder/label_image_filename1 + mypath/myfolder/original_image_filename2,mypath/myfolder/label_image_filename2 + + + +**The ``classes.txt`` file** + The file containing the classes has the format shown below, where each row corresponds to one class (including 'negative' or 'background' class) and each row has 3 values for the 3 RGB values. Of course each class needs to have a different code. :: - class.txt + classes.txt 0 0 0 0 255 0 ... +**Config file with ``sacred``** + The `sacred`_ package is used to deal with experiments and training runs. Have a look at its documentation to use it properly. In order to train a model, you should run ``python train.py with <config.json>`` -.. _sacred: https://sacred.readthedocs.io/en/latest/quickstart.html \ No newline at end of file +.. _sacred: https://sacred.readthedocs.io/en/latest/quickstart.html + + +Multilabel classification training +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In case you want to be able to assign multiple labels to elements, the ``classes.txt`` file must be changed. +Besides the color code, you need to add an *attribution* code to each color. The attribution code has length `n_classes` +and indicates which classes are assigned to the color. + +Take for example 3 classes {A, B, C} and the following possible labelling combinations: + +- A (color code ``(0 255 0)``) with attribution code ``1 0 0`` +- B (color code ``(255 0 0)``) with attribution code ``0 1 0`` +- C (color code ``(0 0 255)``) with attribution code ``0 0 1`` +- AB (color code ``(128 128 128)``) with attribution code ``1 1 0`` +- BC (color code ``(0 255 255)``) with attribution code ``0 1 1`` + +The attribution code has value ``1`` when the label is assigned and ``0`` when it is not.
+(The attribution code ``1 0 1`` would mean that the color annotates elements that belong to classes A and C) + +In our example the ``classes.txt`` file would then look like : :: + + + classes.txt + + 0 0 0 0 0 0 + 0 255 0 1 0 0 + 255 0 0 0 1 0 + 0 0 255 0 0 1 + 128 128 128 1 1 0 + 0 255 255 0 1 1 diff --git a/environment.yml b/environment.yml index 05f572f..f032a88 100644 --- a/environment.yml +++ b/environment.yml @@ -2,23 +2,24 @@ name: dh_segment channels: - defaults dependencies: - - imageio=2.3.0 - - opencv=3.4.1 - - pandas=0.23.0 - - pillow=5.1.0 + - imageio=2.5.0 + - numpy=1.16.2 + - pandas=0.24.2 + - pillow=5.4.1 - python=3.6 - - scikit-image=0.13.1 - - scikit-learn=0.19.1 - - scipy=1.1.0 - - setuptools=39.1.0 + - scikit-image=0.14.2 + - scikit-learn=0.20.3 + - scipy=1.2.1 + - setuptools=40.8.0 - shapely=1.6.4 - - tqdm=4.23.3 + - tensorflow-gpu==1.13.1 + - tqdm=4.31.1 + - requests=2.21.0 - pip: - better-exceptions==0.2.1 - - sacred==0.7.3 - - tensorflow-gpu==1.11 - - sphinx==1.8.1 - - sphinx-autodoc-typehints==1.3.0 - - sphinx-rtd-theme==0.4.1 - - sphinxcontrib-bibtex==0.4.0 - + - opencv-python==4.0.1.23 + - sacred==0.7.4 + - sphinx + - sphinx-autodoc-typehints + - sphinx-rtd-theme + - sphinxcontrib-bibtex diff --git a/general_config.json b/general_config.json index 3101094..a49a268 100644 --- a/general_config.json +++ b/general_config.json @@ -14,17 +14,13 @@ "evaluate_every_epoch" : 10 }, "model_params": { - "batch_norm": true, - "batch_renorm": true, - "selected_levels_upscaling": [ - true, - true, - true, - true, - true - ] + "encoder_network_params": { + "weight_decay": 1e-6 + } }, - "pretrained_model_name" : "resnet50", "prediction_type": "CLASSIFICATION", - "gpu" : "0" + "train_data" : "", + "eval_data" : "", + "classes_file" : "/classes.txt", + "model_output_dir" : "" } \ No newline at end of file diff --git a/pretrained_models/download_resnet_pretrained_model.py b/pretrained_models/download_resnet_pretrained_model.py deleted file mode 100644 index 42943fe..0000000 --- a/pretrained_models/download_resnet_pretrained_model.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python - -import urllib.request -import tarfile -import os -from tqdm import tqdm - - -def progress_hook(t): - last_b = [0] - - def update_to(b=1, bsize=1, tsize=None): - """ - b : int, optional - Number of blocks transferred so far [default: 1]. - bsize : int, optional - Size of each block (in tqdm units) [default: 1]. - tsize : int, optional - Total size (in tqdm units). If [default: None] remains unchanged. 
- """ - if tsize is not None: - t.total = tsize - t.update((b - last_b[0]) * bsize) - last_b[0] = b - - return update_to - - -if __name__ == '__main__': - tar_filename = 'resnet_v1_50.tar.gz' - with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, - desc="Downloading pre-trained weights") as t: - urllib.request.urlretrieve('http://download.tensorflow.org/models/resnet_v1_50_2016_08_28.tar.gz', tar_filename, - reporthook=progress_hook(t)) - tar = tarfile.open(tar_filename) - tar.extractall() - tar.close() - print('Resnet pre-trained weights downloaded!') - os.remove(tar_filename) diff --git a/pretrained_models/download_vgg_pretrained_model.py b/pretrained_models/download_vgg_pretrained_model.py deleted file mode 100644 index d38d89f..0000000 --- a/pretrained_models/download_vgg_pretrained_model.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python - -import urllib.request -import tarfile -import os -from tqdm import tqdm - - -def progress_hook(t): - last_b = [0] - - def update_to(b=1, bsize=1, tsize=None): - """ - b : int, optional - Number of blocks transferred so far [default: 1]. - bsize : int, optional - Size of each block (in tqdm units) [default: 1]. - tsize : int, optional - Total size (in tqdm units). If [default: None] remains unchanged. - """ - if tsize is not None: - t.total = tsize - t.update((b - last_b[0]) * bsize) - last_b[0] = b - - return update_to - - -if __name__ == '__main__': - tar_filename = 'vgg_16.tar.gz' - with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, - desc="Downloading pre-trained weights") as t: - urllib.request.urlretrieve('http://download.tensorflow.org/models/vgg_16_2016_08_28.tar.gz', tar_filename, - reporthook=progress_hook(t)) - tar = tarfile.open(tar_filename) - tar.extractall() - tar.close() - print('VGG-16 pre-trained weights downloaded!') - os.remove(tar_filename) diff --git a/setup.py b/setup.py index aca9532..cc444cb 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages setup(name='dh_segment', - version='0.3', + version='0.4.0', license='GPL', url='https://github.com/dhlab-epfl/dhSegment', description='Generic framework for historical document processing', @@ -11,23 +11,27 @@ 'Paper': 'https://arxiv.org/abs/1804.10371', 'Source Code': 'https://github.com/dhlab-epfl/dhSegment' }, + scripts=['dh_segment_train'], install_requires=[ - 'tensorflow', - 'numpy', - 'imageio', - 'pandas', - 'scipy', - 'shapely', - 'scikit-learn', - 'opencv-python', - 'tqdm', + #'tensorflow-gpu==1.13.1', + 'numpy==1.16.2', + 'imageio==2.5.0', + 'pandas==0.24.2', + 'scipy==1.2.1', + 'shapely==1.6.4', + 'scikit-learn==0.20.3', + 'scikit-image==0.15.0', + 'opencv-python==4.0.1.23', + 'tqdm==4.31.1', + 'sacred==0.7.4', + 'requests==2.21.0' ], extras_require={ 'doc': [ - 'sphinx', - 'sphinx-autodoc-typehints', - 'sphinx-rtd-theme', - 'sphinxcontrib-bibtex', + 'sphinx==1.8.1', + 'sphinx-autodoc-typehints==1.3.0', + 'sphinx-rtd-theme==0.4.1', + 'sphinxcontrib-bibtex==0.4.0', 'sphinxcontrib-websupport' ], },