Update to ART 1.15.2 #2280

Merged
merged 7 commits on Sep 12, 2023
Changes from all commits
18 changes: 9 additions & 9 deletions art/estimators/object_detection/pytorch_faster_rcnn.py
@@ -38,17 +38,17 @@
 
 class PyTorchFasterRCNN(PyTorchObjectDetector):
     """
-    This class implements a model-specific object detector using Faster-RCNN and PyTorch following the input and output
+    This class implements a model-specific object detector using Faster R-CNN and PyTorch following the input and output
     formats of torchvision.
     """
 
     def __init__(
         self,
-        model: Optional["torchvision.models.detection.fasterrcnn_resnet50_fpn"] = None,
+        model: Optional["torchvision.models.detection.FasterRCNN"] = None,
         input_shape: Tuple[int, ...] = (-1, -1, -1),
         optimizer: Optional["torch.optim.Optimizer"] = None,
         clip_values: Optional["CLIP_VALUES_TYPE"] = None,
-        channels_first: Optional[bool] = False,
+        channels_first: Optional[bool] = True,
         preprocessing_defences: Union["Preprocessor", List["Preprocessor"], None] = None,
         postprocessing_defences: Union["Postprocessor", List["Postprocessor"], None] = None,
         preprocessing: "PREPROCESSING_TYPE" = None,
@@ -63,13 +63,13 @@ def __init__(
         """
         Initialization.
 
-        :param model: Faster-RCNN model. The output of the model is `List[Dict[Tensor]]`, one for each input image. The
-                      fields of the Dict are as follows:
+        :param model: Faster R-CNN model. The output of the model is `List[Dict[str, torch.Tensor]]`, one for
+                      each input image. The fields of the Dict are as follows:
 
-                      - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values \
-                        between 0 and H and 0 and W
-                      - labels (Int64Tensor[N]): the predicted labels for each image
-                      - scores (Tensor[N]): the scores or each prediction
+                      - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and
+                        0 <= y1 < y2 <= H.
+                      - labels [N]: the labels for each image.
+                      - scores [N]: the scores of each prediction.
         :param input_shape: The shape of one input sample.
         :param optimizer: The optimizer for training the classifier.
         :param clip_values: Tuple of the form `(min, max)` of floats or `np.ndarray` representing the minimum and
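For readers trying the change, a minimal sketch of constructing the estimator against the updated annotation and the new channels_first=True default. The weights flag, input shape, and clip values below are illustrative assumptions, not taken from this PR.

import numpy as np
import torchvision

from art.estimators.object_detection import PyTorchFasterRCNN

# Any torchvision FasterRCNN instance now satisfies the type hint,
# not just fasterrcnn_resnet50_fpn.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

detector = PyTorchFasterRCNN(
    model=model,
    input_shape=(3, 416, 416),  # channels-first shape, matching the new default
    clip_values=(0, 255),
    channels_first=True,
)

# Predictions follow torchvision's format: one Dict per image with
# boxes [N, 4] in [x1, y1, x2, y2], labels [N], and scores [N].
x = np.zeros((1, 3, 416, 416), dtype=np.float32)
predictions = detector.predict(x)
print(predictions[0]["boxes"].shape, predictions[0]["labels"].shape)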
18 changes: 9 additions & 9 deletions art/estimators/object_detection/pytorch_object_detector.py
@@ -52,7 +52,7 @@ def __init__(
         input_shape: Tuple[int, ...] = (-1, -1, -1),
         optimizer: Optional["torch.optim.Optimizer"] = None,
         clip_values: Optional["CLIP_VALUES_TYPE"] = None,
-        channels_first: Optional[bool] = False,
+        channels_first: Optional[bool] = True,
         preprocessing_defences: Union["Preprocessor", List["Preprocessor"], None] = None,
         postprocessing_defences: Union["Postprocessor", List["Postprocessor"], None] = None,
         preprocessing: "PREPROCESSING_TYPE" = None,
@@ -67,13 +67,13 @@ def __init__(
         """
         Initialization.
 
-        :param model: Object detection model. The output of the model is `List[Dict[Tensor]]`, one for each input
-                      image. The fields of the Dict are as follows:
+        :param model: Object detection model. The output of the model is `List[Dict[str, torch.Tensor]]`, one for
+                      each input image. The fields of the Dict are as follows:
 
-                      - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values
-                        between 0 and H and 0 and W
-                      - labels (Int64Tensor[N]): the predicted labels for each image
-                      - scores (Tensor[N]): the scores or each prediction
+                      - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and
+                        0 <= y1 < y2 <= H.
+                      - labels [N]: the labels for each image.
+                      - scores [N]: the scores of each prediction.
         :param input_shape: The shape of one input sample.
         :param optimizer: The optimizer for training the classifier.
         :param clip_values: Tuple of the form `(min, max)` of floats or `np.ndarray` representing the minimum and
@@ -215,7 +215,7 @@ def _preprocess_and_convert_inputs(
 
             if not self.channels_first:
                 x_tensor = torch.permute(x_tensor, (0, 3, 1, 2))
-            x_tensor /= norm_factor
+            x_tensor = x_tensor / norm_factor
 
             # Set gradients
             if not no_grad:
@@ -236,7 +236,7 @@
 
             if not self.channels_first:
                 x_preprocessed = torch.permute(x_preprocessed, (0, 3, 1, 2))
-            x_preprocessed /= norm_factor
+            x_preprocessed = x_preprocessed / norm_factor
 
             # Set gradients
             if not no_grad:
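A likely motivation for replacing the in-place /= with an out-of-place division, inferred from the diff rather than stated in it: in-place ops raise on leaf tensors that require gradients, and they write through to NumPy arrays that share memory with the tensor. A small standalone illustration:

import numpy as np
import torch

# 1) In-place division is rejected on a leaf tensor that requires grad.
x = torch.ones(2, 3, requires_grad=True)
try:
    x /= 255.0
except RuntimeError as err:
    print("in-place failed:", err)
x = x / 255.0  # out-of-place succeeds and stays differentiable

# 2) In-place ops mutate arrays sharing memory with the tensor.
a = np.full(3, 255.0, dtype=np.float32)
t = torch.from_numpy(a)
t = t / 255.0   # out-of-place: `a` keeps its original values
print(a)        # [255. 255. 255.]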
10 changes: 5 additions & 5 deletions art/estimators/object_detection/pytorch_yolo.py
@@ -103,8 +103,8 @@ def translate_labels_x1y1x2y2_to_xcycwh(
     labels[:, 2:6] = label_dict["boxes"]
 
     # normalize bounding boxes to [0, 1]
-    labels[:, 2:6:2] /= width
-    labels[:, 3:6:2] /= height
+    labels[:, 2:6:2] = labels[:, 2:6:2] / width
+    labels[:, 3:6:2] = labels[:, 3:6:2] / height
 
     # convert from x1y1x2y2 to xcycwh
     labels[:, 4] -= labels[:, 2]
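To make the truncated hunk above easier to follow, here is a self-contained sketch of the full x1y1x2y2-to-xcycwh conversion with made-up numbers. The normalization lines mirror the new out-of-place code; the centre and width/height steps after the visible labels[:, 4] -= labels[:, 2] line are reconstructed from the standard formula, not copied from the file.

import torch

width, height = 640, 480
# columns: [image_index, class, x1, y1, x2, y2]
labels = torch.tensor([[0.0, 1.0, 64.0, 48.0, 320.0, 240.0]])

labels[:, 2:6:2] = labels[:, 2:6:2] / width   # x1, x2 -> [0, 1]
labels[:, 3:6:2] = labels[:, 3:6:2] / height  # y1, y2 -> [0, 1]

labels[:, 4] -= labels[:, 2]      # w  = x2 - x1
labels[:, 5] -= labels[:, 3]      # h  = y2 - y1
labels[:, 2] += labels[:, 4] / 2  # xc = x1 + w / 2
labels[:, 3] += labels[:, 5] / 2  # yc = y1 + h / 2

print(labels)  # tensor([[0.0000, 1.0000, 0.3000, 0.3000, 0.4000, 0.4000]])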
@@ -148,7 +148,7 @@ def __init__(
         """
         Initialization.
 
-        :param model: Object detection model wrapped as demonstrated in examples/get_started_yolo.py.
+        :param model: YOLO v3 or v5 model wrapped as demonstrated in examples/get_started_yolo.py.
                       The output of the model is `List[Dict[str, torch.Tensor]]`, one for each input image.
                       The fields of the Dict are as follows:
 
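examples/get_started_yolo.py itself is not part of this diff. Below is a rough sketch of the wrapping pattern the docstring refers to, assuming the pytorchyolo package and its compute_loss helper; the names, paths, and the loss key are assumptions. The point is the contract: a Dict of named losses in training mode, the usual detection Dicts in eval mode.

import torch
from pytorchyolo import models                    # assumed third-party package
from pytorchyolo.utils.loss import compute_loss   # assumed helper

class YoloWrapper(torch.nn.Module):
    """Adapts a YOLO model to the interface PyTorchYolo expects."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x, targets=None):
        if self.training:
            outputs = self.model(x)
            loss, _ = compute_loss(outputs, targets, self.model)
            return {"loss_total": loss}  # named loss components for training
        return self.model(x)             # detections in eval mode

model = models.load_model(model_path="yolov3.cfg", weights_path="yolov3.weights")
wrapped = YoloWrapper(model)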
@@ -290,7 +290,7 @@ def _preprocess_and_convert_inputs(
 
             if not self.channels_first:
                 x_tensor = torch.permute(x_tensor, (0, 3, 1, 2))
-            x_tensor /= norm_factor
+            x_tensor = x_tensor / norm_factor
 
             # Set gradients
             if not no_grad:
@@ -311,7 +311,7 @@
 
             if not self.channels_first:
                 x_preprocessed = torch.permute(x_preprocessed, (0, 3, 1, 2))
-            x_preprocessed /= norm_factor
+            x_preprocessed = x_preprocessed / norm_factor
 
             # Set gradients
             if not no_grad:
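The same permute-then-scale pattern recurs in each estimator. A toy illustration of what it does for a channels-last input, with assumed shapes and a 255 normalization factor:

import torch

norm_factor = 255.0
channels_first = False

x = torch.randint(0, 256, (2, 416, 416, 3)).float()  # NHWC batch
if not channels_first:
    x = torch.permute(x, (0, 3, 1, 2))  # reorder to NCHW for the model
x = x / norm_factor  # out-of-place scaling into [0, 1]

print(x.shape)  # torch.Size([2, 3, 416, 416])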