.. code:: ipython3

    %load_ext autoreload
    %autoreload 2

.. code:: ipython3

    import sys
    if '..' not in sys.path:
        sys.path.append('..')
    
    from ultrayolo import YoloV3, losses
    from ultrayolo.datasets import CocoFormatDataset, common
    from ultrayolo.helpers import draw
    from pathlib import Path
    import numpy as np
    import tensorflow as tf
    import matplotlib.pyplot as plt

Yolo Loss Tutorial
==================

First, create some random anchors and use the default masks.

.. code:: ipython3

    # nine square base anchors: 5x5, 10x10, ..., 45x45
    x = np.arange(5, 46, 5)
    anchors = np.stack([x, x], axis=-1).astype(np.float32)
    # add a random integer in [0, 10) to the second column so that the anchors
    # are not all square; the amount of noise is sized from the array itself
    # instead of a hard-coded 9, so changing the range above cannot break it
    anchors[:, 1] += np.random.randint(0, 10, len(anchors))
    anchors


.. parsed-literal::

    array([[ 5., 14.],
           [10., 16.],
           [15., 21.],
           [20., 21.],
           [25., 26.],
           [30., 39.],
           [35., 41.],
           [40., 45.],
           [45., 47.]], dtype=float32)


.. code:: ipython3

    masks = YoloV3.default_masks
    masks


.. parsed-literal::

    array([[6, 7, 8],
           [3, 4, 5],
           [0, 1, 2]])


Load the dataset using CocoFormatDataset
----------------------------------------

.. code:: ipython3

    # NOTE(review): `filepath` is not used in this cell; the dataset below is
    # loaded from the COCO json file instead -- confirm whether it can be dropped
    filepath = Path('./toy_dataset/data_annotations_train.txt')
    target_shape = (512, 512, 3)  # shape of every image in a batch
    batch_size = 2
    is_training = True
    max_objects = 10
    
    
    # build the dataset from the COCO-format annotation file, using the anchors
    # and masks created above
    train_seq = CocoFormatDataset('../minicoco_dataset/hair_drier_toaster_bear.json',
                                      target_shape,
                                      max_objects,
                                      batch_size, 
                                      anchors,
                                      masks,
                                      base_grid_size=128,
                                      is_training=is_training)


.. parsed-literal::

    load coco annotations: 100%|██████████| 1714/1714 [00:00<00:00, 797895.34it/s]


.. code:: ipython3

    train_seq.classes


.. parsed-literal::

    [(23, 'bear'), (80, 'toaster'), (89, 'hair drier')]


Now we take a batch from the dataset
------------------------------------

.. code:: ipython3

    x_true, y_true_grids = train_seq[0]

The batch contains:

-  2 images

.. code:: ipython3

    x_true.shape


.. parsed-literal::

    (2, 512, 512, 3)


-  3 grids

.. code:: ipython3

    # for every output grid print its index, its shape and the ratio between
    # the image side and the number of cells per side
    for i, y_grid in enumerate(y_true_grids):
        print(i, '-->', y_grid.shape, target_shape[0] / y_grid.shape[1])


.. parsed-literal::

    0 --> (2, 4, 4, 3, 8) 128.0
    1 --> (2, 8, 8, 3, 8) 64.0
    2 --> (2, 16, 16, 3, 8) 32.0


The third value printed is the size, in pixels, of a grid cell (the image
side divided by the number of cells per side).

Check that the transformed dataset is correct
---------------------------------------------

.. code:: ipython3

    # for every image of the batch and every output grid, draw the ground-truth
    # boxes together with the grid cell in which each of them is stored
    for img_idx in range(len(x_true)):
        print('Show annotations for image', img_idx)
        img = x_true[img_idx]
    
        for i in range(len(y_true_grids)):
            # labels of this image on grid i, indexed as [grid_y, grid_x, box, values]
            y_data_grid_img = y_true_grids[i][img_idx]
            
            ax = draw.show_img(img)
            grid_len = y_data_grid_img.shape[1]
            draw.grid(ax, img.shape[:2], grid_len)
    
            # side of one grid cell: image side divided by the cells per side
            grid_cell_size = target_shape[1] / grid_len
            
            # visit the (grid_y, grid_x, box) entries whose first four values have a positive sum
            for grid_y, grid_x, box in np.argwhere(np.sum(y_data_grid_img[..., :4], axis=-1) > 0):
                # first four values scaled by the image side and truncated to integers
                box_xyxy = (y_data_grid_img[grid_y,grid_x,box, :4] * target_shape[0]).astype(int)
                # index of the first non-zero value from position 5 onwards
                # NOTE(review): class_id is not used afterwards in this cell
                class_id = np.argwhere(y_data_grid_img[grid_y,grid_x,box, 5:])[0][0]
                draw.rect(ax, box_xyxy, 'white', 1)
                print(y_data_grid_img[grid_y,grid_x,box, :4])
                print(box_xyxy)
    
                # rectangle covering the grid cell (grid_x, grid_y) that holds this box:
                # top-left corner first, then top-left + cell size
                rect_resp = np.array([grid_x, grid_y]) * grid_cell_size
                rect_resp = np.concatenate([rect_resp, rect_resp + grid_cell_size])
                draw.rect(ax, rect_resp, 'blue', 2)
    
                # presumably marks the centre of the box -- depends on
                # common.to_center_width_height, TODO confirm
                draw.point(ax, common.to_center_width_height(box_xyxy)[:2])
            plt.show()


.. parsed-literal::

    Show annotations for image 0
    [0.86732817 0.36640626 0.9580156  0.45892185]
    [444 187 490 234]
    [0.8681094  0.48279685 0.9980469  0.6025    ]
    [444 247 511 308]


.. image:: 3_check_yolo_loss_files/3_check_yolo_loss_16_1.png


.. image:: 3_check_yolo_loss_files/3_check_yolo_loss_16_2.png


.. image:: 3_check_yolo_loss_files/3_check_yolo_loss_16_3.png


.. parsed-literal::

    Show annotations for image 1
    [0.09103125 0.06715625 0.9980469  0.65957814]
    [ 46  34 511 337]


.. image:: 3_check_yolo_loss_files/3_check_yolo_loss_16_5.png


.. image:: 3_check_yolo_loss_files/3_check_yolo_loss_16_6.png


.. image:: 3_check_yolo_loss_files/3_check_yolo_loss_16_7.png


Create the model

.. code:: ipython3

    model = YoloV3(target_shape, max_objects, 
                   anchors=anchors, num_classes=train_seq.num_classes, 
                   training=True, backbone='DarkNet', base_grid_size=128)


.. parsed-literal::

    num pooling 2


The architecture can be plotted with
``tf.keras.utils.plot_model(model.model, show_shapes=True)``.

Evaluate how the loss works
---------------------------

We consider two cases:

1. when the network has just been initialized, the predictions should be
   around 0.5 (maximum entropy)
2. when the correct labels are given as predictions, the loss should be
   close to zero

.. code:: ipython3

    y_pred_grids = model(x_true)
    for y_pred in y_pred_grids:
        print(y_pred.shape)


.. parsed-literal::

    (2, 4, 4, 3, 8)
    (2, 8, 8, 3, 8)
    (2, 16, 16, 3, 8)


We take ``i = 0`` because all the annotated objects of this batch fall in
the first grid.

.. code:: ipython3

    i = 0  # index of the grid to inspect
    y_true = y_true_grids[i]
    y_pred = y_pred_grids[i]
    # crop the prediction, starting from index 0 on every axis, to the shape of y_true
    y_pred = tf.slice(y_pred, begin=[0,0,0,0,0], size=y_true.shape)
    # anchors selected by the i-th row of the masks
    anchors_masks = anchors[masks[i]]
    img_size = target_shape[0]
    # NOTE(review): loss_fn is not used by the step-by-step cells visible in this document
    loss_fn = losses.make_loss(train_seq.num_classes, anchors, masks, img_size, len(train_seq))
    ignore_threshold = 0.7  # compared with the best IoU when building ignore_mask

First Case
~~~~~~~~~~

.. code:: ipython3

    from ultrayolo.losses import YoloLoss

.. code:: ipython3

    def to_box_xyxy(box_xy, box_wh, grid_size, anchors_masks):
        """Decode box offsets into corner coordinates (xy_min, xy_max).

        Arguments:
            box_xy {tf.tensor} -- box centres as offsets inside their grid cell
            box_wh {tf.tensor} -- log-scale width/height relative to the anchors
            grid_size {int} -- number of cells per grid side (int or scalar
                integer tensor)
            anchors_masks {tf.tensor} -- the anchors used to scale the sizes
        Returns:
            tf.tensor -- the boxes, with xy_min followed by xy_max on the last axis
        """
        # cell indices: tf.meshgrid defaults to 'xy' indexing, therefore
        # cells[row][col] == (col, row)
        cell_range = tf.range(grid_size)
        cols, rows = tf.meshgrid(cell_range, cell_range)
        cells = tf.stack([cols, rows], axis=-1)
        cells = tf.cast(tf.expand_dims(cells, axis=2), tf.float32)

        # centre: cell index plus offset, divided by the number of cells per side
        centers = (box_xy + cells) / tf.cast(grid_size, tf.float32)

        # size: exp of the raw value times the anchors; infinite results are set to 0
        sizes = tf.exp(box_wh) * anchors_masks
        sizes = tf.where(tf.math.is_inf(sizes), tf.zeros_like(sizes), sizes)

        half_sizes = sizes / 2
        return tf.concat([centers - half_sizes, centers + half_sizes], axis=-1)
    
    def process_predictions(y_pred, num_classes, anchors_masks):
        """Decode the raw predictions of one grid.

        The last axis of y_pred is split into xy, wh, objectness and class
        scores. xy, objectness and class scores go through a sigmoid, while wh
        is kept raw (it is exponentiated inside to_box_xyxy).

        Arguments:
            y_pred {tf.tensor} -- the raw predictions, with layout
                (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...cls))
            num_classes {int} -- the number of classes
            anchors_masks {tf.tensor} -- the anchors used to decode the sizes

        Returns:
            tuple -- box_xyxy, pred_obj, pred_class, pred_xywh
        """
        raw_xy, pred_wh, raw_obj, raw_class = tf.split(
            y_pred, (2, 2, 1, num_classes), axis=-1)

        pred_xy = tf.sigmoid(raw_xy)

        # the number of cells per side is read from axis 1 of the predictions
        cells_per_side = tf.shape(y_pred)[1]
        box_xyxy = to_box_xyxy(pred_xy, pred_wh, cells_per_side, anchors_masks)

        pred_xywh = tf.concat((pred_xy, pred_wh), axis=-1)
        return box_xyxy, tf.sigmoid(raw_obj), tf.sigmoid(raw_class), pred_xywh

.. code:: ipython3

    # 1. transform all pred outputs
    # y_pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...cls))
    # anchors divided by the image side
    anchors_masks_scaled = anchors_masks / img_size
    pred_xyxy, pred_obj, pred_class, pred_xywh = process_predictions(
                tf.cast(y_pred, tf.float32), train_seq.num_classes, anchors_masks_scaled
    )
    pred_xy = pred_xywh[..., 0:2]  # xy after the sigmoid
    pred_wh = pred_xywh[..., 2:4]  # raw wh, no activation applied

Considering the variable ``pred_xywh``, we expect the predictions to be:

-  for xy, 0.5 on average
-  for wh, close to 0
-  for xy1, xy2, close to 0.5

Considering ``pred_xyxy``, the values should be around 0.5.

.. code:: ipython3

    print('average xy', tf.reduce_mean(pred_xy))
    print('average hw', tf.reduce_mean(pred_wh))
    print('average xyxy', tf.reduce_mean(pred_xyxy))


.. parsed-literal::

    average xy tf.Tensor(0.5, shape=(), dtype=float32)
    average hw tf.Tensor(6.6419275e-09, shape=(), dtype=float32)
    average xyxy tf.Tensor(0.5, shape=(), dtype=float32)


The same holds for the objectness and the class predictions.

.. code:: ipython3

    print('average pred_obj', tf.reduce_mean(pred_obj))
    print('average pred_class', tf.reduce_mean(pred_class))


.. parsed-literal::

    average pred_obj tf.Tensor(0.5, shape=(), dtype=float32)
    average pred_class tf.Tensor(0.5, shape=(), dtype=float32)


.. code:: ipython3

    # 2. transform all true outputs
    # y_true: (batch_size, grid, grid, anchors, (x_min, y_min, x_max, y_max, obj, ...cls))
    true_box_xyxy, true_obj, true_class = tf.split(
        y_true, (4, 1, train_seq.num_classes), axis=-1)
    # centre = midpoint of the two corners, size = difference between the two corners
    true_xy = (true_box_xyxy[..., 0:2] + true_box_xyxy[..., 2:4]) / 2
    true_wh = true_box_xyxy[..., 2:4] - true_box_xyxy[..., 0:2]

.. code:: ipython3

    # 2 minus the box area (width * height): the smaller the box, the larger its weight
    box_loss_scale = 2 - true_wh[..., 0] * true_wh[..., 1]

3. inverting the pred box equations, to make it comparable with the
   transformations done for the predictions

.. code:: ipython3

    grid_size = tf.shape(y_true)[1]
    grid = tf.meshgrid(tf.range(grid_size), tf.range(grid_size))  # cell indices, built as in to_box_xyxy
    grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2)
    true_xy = true_xy * tf.cast(grid_size, tf.float32) - \
        tf.cast(grid, tf.float32)  # centre expressed as an offset inside its grid cell
    
    true_wh = tf.math.log(true_wh / anchors_masks_scaled)  # inverse of exp(wh) * anchors
    true_wh = tf.where(tf.math.is_inf(true_wh),
                       tf.zeros_like(true_wh), true_wh)  # infinite values (e.g. log(0)) become 0

The ``tf.math.log`` line above is the inverse of the transformation applied
to the predictions

::

   box_wh = tf.exp(box_wh) * anchors_masks

The masks are used to separate:

1. the boxes that contain objects, which are considered in the object loss,
   from
2. the boxes that do not contain objects, which are considered in the
   **no object loss**

.. code:: ipython3

    # 4. calculate all masks
    # objectness label without its trailing axis; used as a multiplicative mask
    obj_mask = tf.squeeze(true_obj, -1)
    # ignore false positive when iou is over threshold
    # keep only the true boxes of the entries selected by obj_mask
    true_box_mask = tf.boolean_mask(
        true_box_xyxy, tf.cast(obj_mask, tf.bool))
    # presumably the highest IoU of every predicted box with any of those true
    # boxes -- depends on YoloLoss.broadcast_iou, TODO confirm
    best_iou = tf.reduce_max(YoloLoss.broadcast_iou(
        pred_xyxy, true_box_mask), axis=-1)
    # 1.0 where the best IoU is below the threshold, 0.0 otherwise
    ignore_mask = tf.cast(best_iou < ignore_threshold, tf.float32)

Compute all the losses

-  xy, wh only with respect to the cells that contain objects

.. code:: ipython3

    # squared error summed over the last axis, multiplied by obj_mask (zero
    # where there is no object) and weighted by box_loss_scale
    xy_loss = obj_mask * box_loss_scale * \
        tf.reduce_sum(tf.square(true_xy - pred_xy), axis=-1)
    wh_loss = obj_mask * box_loss_scale * \
        tf.reduce_sum(tf.square(true_wh - pred_wh), axis=-1)

-  the object and no object loss

You can check that wherever the loss is non-zero in ``obj_loss`` it is zero
in ``no_obj_loss``, and vice versa.

.. code:: ipython3

    # binary cross-entropy between the true and the predicted objectness
    # (pred_obj is already a probability, hence from_logits=False)
    obj_cross_entropy = tf.keras.metrics.binary_crossentropy(
        true_obj, pred_obj, from_logits=False)
    # kept where obj_mask is 1
    obj_loss = obj_mask * obj_cross_entropy
    # kept where obj_mask is 0, except the entries zeroed by ignore_mask
    no_obj_loss = (1 - obj_mask) * ignore_mask * obj_cross_entropy

-  The class loss is computed only for the cells that contain objects

.. code:: ipython3

    # binary cross-entropy over the class scores, multiplied by obj_mask
    class_loss = obj_mask * tf.keras.metrics.binary_crossentropy(
                true_class, pred_class, from_logits=False)

-  everything is reduced to one value per image

.. code:: ipython3

    # sum every component over axes 1, 2 and 3, leaving one value per image
    xy_loss = tf.reduce_sum(xy_loss, axis=(1, 2, 3))
    wh_loss = tf.reduce_sum(wh_loss, axis=(1, 2, 3))
    obj_loss = tf.reduce_sum(obj_loss, axis=(1, 2, 3))
    no_obj_loss = tf.reduce_sum(no_obj_loss, axis=(1, 2, 3))
    class_loss = tf.reduce_sum(class_loss, axis=(1, 2, 3))
    
    # total loss: plain sum of the five components
    loss = xy_loss + wh_loss + obj_loss + no_obj_loss + class_loss
    
    loss


.. parsed-literal::

    <tf.Tensor: shape=(2,), dtype=float32, numpy=array([35.51542, 47.17279], dtype=float32)>


Second Case
~~~~~~~~~~~

-  y_true == y_pred

.. code:: ipython3

    i = 0
    y_true = y_true_grids[i]
    y_pred = y_true

-  Remember that y_pred is in format xy_min xy_max

.. code:: ipython3

    # y_pred is y_true here, so it is split like the labels: 4 box values,
    # 1 objectness value and num_classes class values
    pred_xyxy, pred_obj, pred_class = tf.split(
            y_pred, (4, 1, train_seq.num_classes), axis=-1)
    
    # centre = midpoint of the two corners, size = difference between the two corners
    pred_xy = (pred_xyxy[..., 0:2] + pred_xyxy[..., 2:4]) / 2
    pred_wh = pred_xyxy[..., 2:4] - pred_xyxy[..., 0:2]
    
    pred_xywh = tf.concat((pred_xy, pred_wh), axis=-1)

.. code:: ipython3

    # split the labels into 4 box values, 1 objectness value and the class values
    true_box_xyxy, true_obj, true_class = tf.split(
                y_true, (4, 1, train_seq.num_classes), axis=-1)
    # centre = midpoint of the two corners, size = difference between the two corners
    true_xy = (true_box_xyxy[..., 0:2] + true_box_xyxy[..., 2:4]) / 2
    true_wh = true_box_xyxy[..., 2:4] - true_box_xyxy[..., 0:2]

.. code:: ipython3

    # 2 minus the box area (width * height): the smaller the box, the larger its weight
    box_loss_scale = 2 - true_wh[..., 0] * true_wh[..., 1]
    
    # 4. calculate all masks
    # objectness label without its trailing axis; used as a multiplicative mask
    obj_mask = tf.squeeze(true_obj, -1)
    # ignore false positive when iou is over threshold
    # keep only the true boxes of the entries selected by obj_mask
    true_box_mask = tf.boolean_mask(
        true_box_xyxy, tf.cast(obj_mask, tf.bool))
    # presumably the highest IoU of every predicted box with any of those true
    # boxes -- depends on YoloLoss.broadcast_iou, TODO confirm
    best_iou = tf.reduce_max(YoloLoss.broadcast_iou(
        pred_xyxy, true_box_mask), axis=-1)
    # 1.0 where the best IoU is below the threshold, 0.0 otherwise
    ignore_mask = tf.cast(best_iou < ignore_threshold, tf.float32)

.. code:: ipython3

    # 5. compute all the losses
    xy_loss = obj_mask * box_loss_scale * \
        tf.reduce_sum(tf.square(true_xy - pred_xy), axis=-1)
    wh_loss = obj_mask * box_loss_scale * \
        tf.reduce_sum(tf.square(true_wh - pred_wh), axis=-1)

.. code:: ipython3

    # binary cross-entropy between the true and the predicted objectness
    obj_cross_entropy = tf.keras.metrics.binary_crossentropy(
        true_obj, pred_obj, from_logits=False)
    # kept where obj_mask is 1
    obj_loss = obj_mask * obj_cross_entropy
    # kept where obj_mask is 0, except the entries zeroed by ignore_mask
    no_obj_loss = (1 - obj_mask) * ignore_mask * obj_cross_entropy
    
    # binary cross-entropy over the class scores, multiplied by obj_mask
    class_loss = obj_mask * tf.keras.metrics.binary_crossentropy(
        true_class, pred_class, from_logits=False)

.. code:: ipython3

    # sum every component over axes 1, 2 and 3, leaving one value per image
    xy_loss = tf.reduce_sum(xy_loss, axis=(1, 2, 3))
    wh_loss = tf.reduce_sum(wh_loss, axis=(1, 2, 3))
    obj_loss = tf.reduce_sum(obj_loss, axis=(1, 2, 3))
    no_obj_loss = tf.reduce_sum(no_obj_loss, axis=(1, 2, 3))
    class_loss = tf.reduce_sum(class_loss, axis=(1, 2, 3))
    
    # total loss: plain sum of the five components
    loss = xy_loss + wh_loss + obj_loss + no_obj_loss + class_loss
    loss


.. parsed-literal::

    <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0., 0.], dtype=float32)>


The loss is 0 when the prediction is equal to the true values

Conclusion
----------

We have verified that the loss:

-  returns the maximum-entropy value when the network is just initialized, and
-  returns 0 when ``y_pred`` is equal to ``y_true``