# model.py


r"""Provides network model definition and helper functions.

"An efficient solution for semantic segmentation: ShuffleNet V2 with atrous separable convolutions.",
Türkmen, Sercan, and Janne Heikkilä.
arXiv preprint arXiv:1902.07476 (2019).
(https://arxiv.org/abs/1902.07476)
"""
import tensorflow as tf
from tensorflow.contrib import slim
from core import dense_prediction_cell
from core import feature_extractor
from core import utils


# Scope names and dictionary keys shared by the functions in this module.
# All of them except MERGED_LOGITS_SCOPE are reported by
# get_extra_layer_scopes().
# Variable scope that wraps the per-output logits convolutions.
LOGITS_SCOPE_NAME = 'logits'
# Dictionary key under which the merged (or single-scale) logits are stored.
MERGED_LOGITS_SCOPE = 'merged_logits'
# Scope of the 1x1 convolution applied to the image-level (pooled) feature.
IMAGE_POOLING_SCOPE = 'image_pooling'
# Scope prefix of the ASPP branches; the branch index is appended to it.
ASPP_SCOPE = 'aspp'
# Scope of the 1x1 convolution applied to the concatenated ASPP branches;
# also the prefix of the dropout scope that follows it.
CONCAT_PROJECTION_SCOPE = 'concat_projection'
# Variable scope that wraps the decoder layers.
DECODER_SCOPE = 'decoder'
# Not used as a scope in the code visible here; only listed by
# get_extra_layer_scopes().
META_ARCHITECTURE_SCOPE = 'meta_architecture'

# Module-level aliases for helpers exposed by core.utils.
scale_dimension = utils.scale_dimension
split_separable_conv2d = utils.split_separable_conv2d


def get_extra_layer_scopes(last_layers_contain_logits_only=False):
    """Lists the scope names of the extra layers.

    Args:
      last_layers_contain_logits_only: Boolean. When True, only the logits
        scope is reported; the image pooling, ASPP, concat projection, decoder
        and meta-architecture scopes are left out.

    Returns:
      A new list of scope-name strings, always starting with the logits scope.
    """
    scopes = [LOGITS_SCOPE_NAME]
    if not last_layers_contain_logits_only:
        scopes.extend([
            IMAGE_POOLING_SCOPE,
            ASPP_SCOPE,
            CONCAT_PROJECTION_SCOPE,
            DECODER_SCOPE,
            META_ARCHITECTURE_SCOPE,
        ])
    return scopes


def predict_labels_multi_scale(images,
                               model_options,
                               eval_scales=(1.0,),
                               add_flipped_images=False):
    """Predicts labels by averaging softmax scores over scales and flips.

    The model is run once per entry of `eval_scales` (and once more on the
    images reversed along axis 2 when `add_flipped_images` is set). Every
    merged logits map is resized to the input height/width, turned into
    softmax probabilities, and the probabilities are averaged before the
    argmax over channels is taken.

    Args:
      images: A tensor of size [batch, height, width, channels].
      model_options: A ModelOptions instance to configure models.
      eval_scales: The scales to resize images for evaluation.
      add_flipped_images: Add flipped images for evaluation or not.

    Returns:
      A dictionary keyed by output_type (e.g., semantic prediction) whose
        values are the argmax-over-channels predictions, each of size
        [batch, height, width].
    """

    def _probabilities_at_input_size(merged_logits):
        # Resize to the input resolution, then add a trailing axis so the
        # per-scale probabilities can be concatenated and averaged later.
        upsampled = tf.image.resize_bilinear(
            merged_logits, tf.shape(images)[1:3], align_corners=True)
        return tf.expand_dims(tf.nn.softmax(upsampled), 4)

    outputs_to_predictions = {}
    for name in model_options.outputs_to_num_classes:
        outputs_to_predictions[name] = []

    for scale_index, image_scale in enumerate(eval_scales):
        # Variables are created by the first scale and reused by later ones.
        reuse = True if scale_index else None
        with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
            logits_by_output = multi_scale_logits(
                images,
                model_options=model_options,
                image_pyramid=[image_scale],
                is_training=False,
                fine_tune_batch_norm=False)

        if add_flipped_images:
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                flipped_logits_by_output = multi_scale_logits(
                    tf.reverse_v2(images, [2]),
                    model_options=model_options,
                    image_pyramid=[image_scale],
                    is_training=False,
                    fine_tune_batch_norm=False)

        for name in sorted(logits_by_output):
            probabilities = _probabilities_at_input_size(
                logits_by_output[name][MERGED_LOGITS_SCOPE])
            outputs_to_predictions[name].append(probabilities)

            if add_flipped_images:
                # Undo the flip on the logits before resizing them.
                unflipped = tf.reverse_v2(
                    flipped_logits_by_output[name][MERGED_LOGITS_SCOPE], [2])
                outputs_to_predictions[name].append(
                    _probabilities_at_input_size(unflipped))

    for name in sorted(outputs_to_predictions):
        # Average over all scales / flips, then pick the best class.
        stacked = tf.concat(outputs_to_predictions[name], 4)
        averaged = tf.reduce_mean(stacked, axis=4)
        outputs_to_predictions[name] = tf.argmax(averaged, 3)

    return outputs_to_predictions


def predict_labels(images, model_options, image_pyramid=None):
    """Predicts segmentation labels from the merged multi-scale logits.

    Args:
      images: A tensor of size [batch, height, width, channels].
      model_options: A ModelOptions instance to configure models.
      image_pyramid: Input image scales for multi-scale feature extraction.

    Returns:
      A dictionary keyed by output_type (e.g., semantic prediction) whose
        values are the argmax-over-channels predictions, each of size
        [batch, height, width].
    """

    def _labels(merged_logits):
        # Bring the logits back to the input resolution before the argmax.
        upsampled = tf.image.resize_bilinear(
            merged_logits, tf.shape(images)[1:3], align_corners=True)
        return tf.argmax(upsampled, 3)

    logits_by_output = multi_scale_logits(
        images,
        model_options=model_options,
        image_pyramid=image_pyramid,
        is_training=False,
        fine_tune_batch_norm=False)

    return {
        name: _labels(logits_by_output[name][MERGED_LOGITS_SCOPE])
        for name in sorted(logits_by_output)
    }


def _resize_bilinear(images, size, output_dtype=tf.float32):
    """Bilinearly resizes images (align_corners=True) and casts the result.

    Args:
      images: A tensor of size [batch, height_in, width_in, channels].
      size: A 1-D int32 Tensor of 2 elements: new_height, new_width. The new size
        for the images.
      output_dtype: The dtype the resized tensor is cast to.

    Returns:
      A tensor of size [batch, height_out, width_out, channels] with dtype
        output_dtype.
    """
    return tf.cast(
        tf.image.resize_bilinear(images, size, align_corners=True),
        dtype=output_dtype)


def multi_scale_logits(images,
                       model_options,
                       image_pyramid,
                       weight_decay=0.0001,
                       is_training=False,
                       fine_tune_batch_norm=False):
    """Gets the logits for multi-scale inputs.

    The logits of every scale are resized to one common size, derived from the
    crop size (or the dynamic image shape when no crop size is set), the
    largest pyramid scale (but at least 1.0) and the logits output stride
    (`decoder_output_stride` if set, otherwise `output_stride`).

    Args:
      images: A tensor of size [batch, height, width, channels].
      model_options: A ModelOptions instance to configure models.
      image_pyramid: Input image scales for multi-scale feature extraction.
        An empty/None value is treated as [1.0].
      weight_decay: The weight decay for model variables.
      is_training: Is training or not.
      fine_tune_batch_norm: Fine-tune the batch norm parameters or not.

    Returns:
      outputs_to_scales_to_logits: A map of maps from output_type (e.g.,
        semantic prediction) to a dictionary of multi-scale logits names to
        logits. With a single scale the inner dictionary only holds
        'merged_logits'. With several scales, e.g. [1.0, 1.5], it holds
        'logits_1.00', 'logits_1.50' and 'merged_logits', the latter being the
        element-wise max (merge_method == 'max') or mean of the others.
    """
    # Setup default values.
    if not image_pyramid:
        image_pyramid = [1.0]

    if model_options.crop_size:
        crop_height = model_options.crop_size[0]
        crop_width = model_options.crop_size[1]
    else:
        crop_height = tf.shape(images)[1]
        crop_width = tf.shape(images)[2]

    # Common spatial size all per-scale logits are resized to.
    logits_output_stride = model_options.decoder_output_stride
    if not logits_output_stride:
        logits_output_stride = model_options.output_stride
    logits_scale = max(1.0, max(image_pyramid)) / logits_output_stride
    logits_height = scale_dimension(crop_height, logits_scale)
    logits_width = scale_dimension(crop_width, logits_scale)

    outputs_to_scales_to_logits = {}
    for name in model_options.outputs_to_num_classes:
        outputs_to_scales_to_logits[name] = {}

    single_scale = len(image_pyramid) == 1

    for image_scale in image_pyramid:
        if image_scale == 1.0:
            scaled_crop_size = model_options.crop_size
            scaled_images = images
        else:
            scaled_height = scale_dimension(crop_height, image_scale)
            scaled_width = scale_dimension(crop_width, image_scale)
            scaled_crop_size = [scaled_height, scaled_width]
            scaled_images = tf.image.resize_bilinear(
                images, scaled_crop_size, align_corners=True)
            if model_options.crop_size:
                scaled_images.set_shape([None, scaled_height, scaled_width, 3])

        outputs_to_logits = _get_logits(
            scaled_images,
            model_options._replace(crop_size=scaled_crop_size),
            weight_decay=weight_decay,
            reuse=tf.AUTO_REUSE,
            is_training=is_training,
            fine_tune_batch_norm=fine_tune_batch_norm)

        # Bring the logits of this scale to the common size before merging.
        for name in sorted(outputs_to_logits):
            outputs_to_logits[name] = tf.image.resize_bilinear(
                outputs_to_logits[name], [logits_height, logits_width],
                align_corners=True)

        # With one scale the logits are stored directly as the merged result.
        key = (MERGED_LOGITS_SCOPE if single_scale
               else 'logits_%.2f' % image_scale)
        for name in sorted(model_options.outputs_to_num_classes):
            outputs_to_scales_to_logits[name][key] = outputs_to_logits[name]
        if single_scale:
            return outputs_to_scales_to_logits

    # Merge the logits from all the multi-scale inputs.
    for name in sorted(model_options.outputs_to_num_classes):
        scales_to_logits = outputs_to_scales_to_logits[name]
        # Stack the per-scale logits along a new trailing axis.
        stacked = tf.concat(
            [tf.expand_dims(scale_logits, axis=4)
             for scale_logits in scales_to_logits.values()],
            4)
        if model_options.merge_method == 'max':
            merged = tf.reduce_max(stacked, axis=4)
        else:
            merged = tf.reduce_mean(stacked, axis=4)
        scales_to_logits[MERGED_LOGITS_SCOPE] = merged

    return outputs_to_scales_to_logits


def extract_features(images,
                     model_options,
                     weight_decay=0.0001,
                     reuse=None,
                     is_training=False,
                     fine_tune_batch_norm=False):
    """Extracts features by the particular model_variant.

    Depending on `model_options` the result is one of:
      * the raw backbone features (aspp_with_batch_norm is falsy),
      * the output of a dense prediction cell (dense_prediction_cell_config is
        set), or
      * the output of the ASPP module built here: optional image-level
        feature, a 1x1 branch, one 3x3 atrous branch per rate, concatenation,
        a 1x1 projection and dropout.

    Args:
      images: A tensor of size [batch, height, width, channels].
      model_options: A ModelOptions instance to configure models.
      weight_decay: The weight decay for model variables.
      reuse: Reuse the model variables or not.
      is_training: Is training or not.
      fine_tune_batch_norm: Fine-tune the batch norm parameters or not.

    Returns:
      concat_logits: A tensor of size [batch, feature_height, feature_width,
        feature_channels], where feature_height/feature_width are determined by
        the images height/width and output_stride.
      end_points: A dictionary from components of the network to the corresponding
        activation.
    """
    features, end_points = feature_extractor.extract_features(
        images,
        output_stride=model_options.output_stride,
        multi_grid=model_options.multi_grid,
        model_variant=model_options.model_variant,
        depth_multiplier=model_options.depth_multiplier,
        weight_decay=weight_decay,
        reuse=reuse,
        is_training=is_training,
        fine_tune_batch_norm=fine_tune_batch_norm)

    # Without batch-normalized ASPP the backbone output is returned as is.
    if not model_options.aspp_with_batch_norm:
        return features, end_points

    if model_options.dense_prediction_cell_config is not None:
        tf.logging.info('Using dense prediction cell config.')
        cell = dense_prediction_cell.DensePredictionCell(
            config=model_options.dense_prediction_cell_config,
            hparams={
                'conv_rate_multiplier': 16 // model_options.output_stride,
            })
        cell_output = cell.build_cell(
            features,
            output_stride=model_options.output_stride,
            crop_size=model_options.crop_size,
            image_pooling_crop_size=model_options.image_pooling_crop_size,
            weight_decay=weight_decay,
            reuse=reuse,
            is_training=is_training,
            fine_tune_batch_norm=fine_tune_batch_norm)
        return cell_output, end_points

    # From here on: the DeepLabv3 ASPP module, kept as explicit code (rather
    # than as a dense prediction cell) for backward compatibility.
    bn_kwargs = {
        'is_training': is_training and fine_tune_batch_norm,
        'decay': 0.9997,
        'epsilon': 1e-5,
        'scale': True,
    }
    aspp_depth = 256

    with slim.arg_scope(
            [slim.conv2d, slim.separable_conv2d],
            weights_regularizer=slim.l2_regularizer(weight_decay),
            activation_fn=tf.nn.relu,
            normalizer_fn=slim.batch_norm,
            padding='SAME',
            stride=1,
            reuse=reuse):
        with slim.arg_scope([slim.batch_norm], **bn_kwargs):
            branches = []

            if model_options.add_image_level_feature:
                crop_size = model_options.crop_size
                if crop_size is None:
                    # No crop size available: pool over the whole feature map.
                    resize_height = tf.shape(features)[1]
                    resize_width = tf.shape(features)[2]
                    pooled = tf.reduce_mean(
                        features, axis=[1, 2], keepdims=True)
                else:
                    # Pooling window comes from image_pooling_crop_size,
                    # falling back to crop_size when it is not specified.
                    pooling_crop_size = model_options.image_pooling_crop_size
                    if pooling_crop_size is None:
                        pooling_crop_size = crop_size
                    inverse_stride = 1. / model_options.output_stride
                    pool_height = scale_dimension(
                        pooling_crop_size[0], inverse_stride)
                    pool_width = scale_dimension(
                        pooling_crop_size[1], inverse_stride)
                    pooled = slim.avg_pool2d(
                        features, [pool_height, pool_width], [1, 1],
                        padding='VALID')
                    resize_height = scale_dimension(
                        crop_size[0], inverse_stride)
                    resize_width = scale_dimension(
                        crop_size[1], inverse_stride)

                pooled = slim.conv2d(
                    pooled, aspp_depth, 1, scope=IMAGE_POOLING_SCOPE)
                pooled = _resize_bilinear(
                    pooled, [resize_height, resize_width], pooled.dtype)
                # Only statically known sizes can be recorded in the shape.
                static_height = (None if isinstance(resize_height, tf.Tensor)
                                 else resize_height)
                static_width = (None if isinstance(resize_width, tf.Tensor)
                                else resize_width)
                pooled.set_shape(
                    [None, static_height, static_width, aspp_depth])
                branches.append(pooled)

            # 1x1 convolution branch.
            branches.append(
                slim.conv2d(features, aspp_depth, 1,
                            scope=ASPP_SCOPE + str(0)))

            # 3x3 convolution branches, one per atrous rate (if any).
            for index, rate in enumerate(model_options.atrous_rates or (), 1):
                branch_scope = ASPP_SCOPE + str(index)
                if model_options.aspp_with_separable_conv:
                    branch = split_separable_conv2d(
                        features,
                        filters=aspp_depth,
                        rate=rate,
                        weight_decay=weight_decay,
                        scope=branch_scope)
                else:
                    branch = slim.conv2d(
                        features, aspp_depth, 3, rate=rate,
                        scope=branch_scope)
                branches.append(branch)

            # Concatenate all branches along channels and project them.
            projected = slim.conv2d(
                tf.concat(branches, 3), aspp_depth, 1,
                scope=CONCAT_PROJECTION_SCOPE)
            projected = slim.dropout(
                projected,
                keep_prob=0.9,
                is_training=is_training,
                scope=CONCAT_PROJECTION_SCOPE + '_dropout')

            return projected, end_points


def _get_logits(images,
                model_options,
                weight_decay=0.0001,
                reuse=None,
                is_training=False,
                fine_tune_batch_norm=False):
    """Gets the logits by atrous/image spatial pyramid pooling.

    Features are extracted, optionally refined by the decoder (when
    `decoder_output_stride` is set) and then fed to one logits head per entry
    of `outputs_to_num_classes`.

    Args:
      images: A tensor of size [batch, height, width, channels].
      model_options: A ModelOptions instance to configure models.
      weight_decay: The weight decay for model variables.
      reuse: Reuse the model variables or not.
      is_training: Is training or not.
      fine_tune_batch_norm: Fine-tune the batch norm parameters or not.

    Returns:
      outputs_to_logits: A map from output_type to logits.
    """
    features, end_points = extract_features(
        images,
        model_options,
        weight_decay=weight_decay,
        reuse=reuse,
        is_training=is_training,
        fine_tune_batch_norm=fine_tune_batch_norm)

    decoder_stride = model_options.decoder_output_stride
    if decoder_stride is not None:
        # Decoder resolution is the crop size (or the dynamic image size when
        # no crop size is set) scaled by 1 / decoder_output_stride.
        if model_options.crop_size is not None:
            height, width = model_options.crop_size
        else:
            height = tf.shape(images)[1]
            width = tf.shape(images)[2]
        inverse_stride = 1.0 / decoder_stride
        features = refine_by_decoder(
            features,
            end_points,
            decoder_height=scale_dimension(height, inverse_stride),
            decoder_width=scale_dimension(width, inverse_stride),
            decoder_use_separable_conv=model_options.decoder_use_separable_conv,
            model_variant=model_options.model_variant,
            weight_decay=weight_decay,
            reuse=reuse,
            is_training=is_training,
            fine_tune_batch_norm=fine_tune_batch_norm)

    # One logits head per output type, built in sorted order of the names.
    return {
        name: get_branch_logits(
            features,
            model_options.outputs_to_num_classes[name],
            model_options.atrous_rates,
            aspp_with_batch_norm=model_options.aspp_with_batch_norm,
            kernel_size=model_options.logits_kernel_size,
            weight_decay=weight_decay,
            reuse=reuse,
            scope_suffix=name)
        for name in sorted(model_options.outputs_to_num_classes)
    }


def refine_by_decoder(features,
                      end_points,
                      decoder_height,
                      decoder_width,
                      decoder_use_separable_conv=False,
                      model_variant=None,
                      weight_decay=0.0001,
                      reuse=None,
                      is_training=False,
                      fine_tune_batch_norm=False):
    """Adds the decoder to obtain sharper segmentation results.

    For every decoder end point registered for `model_variant`, the end point
    is projected to 48 channels with a 1x1 convolution, resized together with
    the current decoder features to [decoder_height, decoder_width],
    concatenated along channels and passed through two 256-channel
    convolutions (separable or regular 3x3).

    Args:
      features: A tensor of size [batch, features_height, features_width,
        features_channels].
      end_points: A dictionary from components of the network to the corresponding
        activation.
      decoder_height: The height of decoder feature maps.
      decoder_width: The width of decoder feature maps.
      decoder_use_separable_conv: Employ separable convolution for decoder or not.
      model_variant: Model variant for feature extraction.
      weight_decay: The weight decay for model variables.
      reuse: Reuse the model variables or not.
      is_training: Is training or not.
      fine_tune_batch_norm: Fine-tune the batch norm parameters or not.

    Returns:
      Decoder output with size [batch, decoder_height, decoder_width,
        decoder_channels], or `features` unchanged when no decoder end points
        are registered for the model variant.
    """
    bn_kwargs = {
        'is_training': is_training and fine_tune_batch_norm,
        'decay': 0.9997,
        'epsilon': 1e-5,
        'scale': True,
    }

    with slim.arg_scope(
            [slim.conv2d, slim.separable_conv2d],
            weights_regularizer=slim.l2_regularizer(weight_decay),
            activation_fn=tf.nn.relu,
            normalizer_fn=slim.batch_norm,
            padding='SAME',
            stride=1,
            reuse=reuse):
        with slim.arg_scope([slim.batch_norm], **bn_kwargs):
            with tf.variable_scope(DECODER_SCOPE, DECODER_SCOPE, [features]):
                skip_names = feature_extractor.networks_to_feature_maps[
                    model_variant][feature_extractor.DECODER_END_POINTS]
                if skip_names is None:
                    tf.logging.info('Not found any decoder end points.')
                    return features

                # Only statically known sizes can be recorded in the shape.
                static_height = (None if isinstance(decoder_height, tf.Tensor)
                                 else decoder_height)
                static_width = (None if isinstance(decoder_width, tf.Tensor)
                                else decoder_width)
                decoder_depth = 256

                decoder_features = features
                for i, name in enumerate(skip_names):
                    # MobileNet variants use different naming convention.
                    if 'mobilenet' in model_variant:
                        end_point_name = name
                    else:
                        end_point_name = '{}/{}'.format(
                            feature_extractor.name_scope[model_variant], name)
                    projected = slim.conv2d(
                        end_points[end_point_name],
                        48,
                        1,
                        scope='feature_projection' + str(i))

                    # Resize both inputs to decoder_height/decoder_width.
                    resized = []
                    for tensor in (decoder_features, projected):
                        tensor = tf.image.resize_bilinear(
                            tensor, [decoder_height, decoder_width],
                            align_corners=True)
                        tensor.set_shape(
                            [None, static_height, static_width, None])
                        resized.append(tensor)
                    merged = tf.concat(resized, 3)

                    if decoder_use_separable_conv:
                        decoder_features = merged
                        for conv_scope in ('decoder_conv0', 'decoder_conv1'):
                            decoder_features = split_separable_conv2d(
                                decoder_features,
                                filters=decoder_depth,
                                rate=1,
                                weight_decay=weight_decay,
                                scope=conv_scope)
                    else:
                        # Two stacked 3x3 convolutions.
                        decoder_features = slim.repeat(
                            merged,
                            2,
                            slim.conv2d,
                            decoder_depth,
                            3,
                            scope='decoder_conv' + str(i))
                return decoder_features


def get_branch_logits(features,
                      num_classes,
                      atrous_rates=None,
                      aspp_with_batch_norm=False,
                      kernel_size=1,
                      weight_decay=0.0001,
                      reuse=None,
                      scope_suffix=''):
    """Gets the logits from each model's branch.

    One convolution (no activation, no normalizer) is built per atrous rate
    and the branch outputs are summed to form the final logits.

    Args:
      features: A float tensor of shape [batch, height, width, channels].
      num_classes: Number of classes to predict.
      atrous_rates: A list of atrous convolution rates for last layer.
      aspp_with_batch_norm: Use batch normalization layers for ASPP.
      kernel_size: Kernel size for convolution.
      weight_decay: Weight decay for the model variables.
      reuse: Reuse model variables or not.
      scope_suffix: Scope suffix for the model variables. The first branch
        uses it as is; branch i > 0 uses '<scope_suffix>_<i>'.

    Returns:
      Merged logits with shape [batch, height, width, num_classes].

    Raises:
      ValueError: If kernel_size is not 1 while aspp_with_batch_norm is set or
        atrous_rates is None.
    """
    # With batch-normalized ASPP the pyramid has already been applied in
    # extract_features, so a single rate-1 branch is all that is left to do.
    single_branch = aspp_with_batch_norm or atrous_rates is None
    if single_branch and kernel_size != 1:
        raise ValueError('Kernel size must be 1 when atrous_rates is None or '
                         'using aspp_with_batch_norm. Gets %d.' % kernel_size)
    if single_branch:
        atrous_rates = [1]

    def _branch(index, rate):
        # Branch 0 keeps the bare suffix; later branches get an index suffix.
        branch_scope = (scope_suffix + '_%d' % index) if index else scope_suffix
        return slim.conv2d(
            features,
            num_classes,
            kernel_size=kernel_size,
            rate=rate,
            activation_fn=None,
            normalizer_fn=None,
            scope=branch_scope)

    with slim.arg_scope(
            [slim.conv2d],
            weights_regularizer=slim.l2_regularizer(weight_decay),
            weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
            reuse=reuse):
        with tf.variable_scope(LOGITS_SCOPE_NAME, LOGITS_SCOPE_NAME, [features]):
            return tf.add_n(
                [_branch(i, rate) for i, rate in enumerate(atrous_rates)])