diff --git a/art/estimators/object_detection/pytorch_faster_rcnn.py b/art/estimators/object_detection/pytorch_faster_rcnn.py
index ce18fea694..ddd635b03f 100644
--- a/art/estimators/object_detection/pytorch_faster_rcnn.py
+++ b/art/estimators/object_detection/pytorch_faster_rcnn.py
@@ -38,17 +38,17 @@ class PyTorchFasterRCNN(PyTorchObjectDetector):
     """
-    This class implements a model-specific object detector using Faster-RCNN and PyTorch following the input and output
+    This class implements a model-specific object detector using Faster R-CNN and PyTorch following the input and output
     formats of torchvision.
     """

     def __init__(
         self,
-        model: Optional["torchvision.models.detection.fasterrcnn_resnet50_fpn"] = None,
+        model: Optional["torchvision.models.detection.FasterRCNN"] = None,
         input_shape: Tuple[int, ...] = (-1, -1, -1),
         optimizer: Optional["torch.optim.Optimizer"] = None,
         clip_values: Optional["CLIP_VALUES_TYPE"] = None,
-        channels_first: Optional[bool] = False,
+        channels_first: Optional[bool] = True,
         preprocessing_defences: Union["Preprocessor", List["Preprocessor"], None] = None,
         postprocessing_defences: Union["Postprocessor", List["Postprocessor"], None] = None,
         preprocessing: "PREPROCESSING_TYPE" = None,
@@ -63,13 +63,13 @@ def __init__(
         """
         Initialization.

-        :param model: Faster-RCNN model. The output of the model is `List[Dict[Tensor]]`, one for each input image. The
-                      fields of the Dict are as follows:
+        :param model: Faster R-CNN model. The output of the model is `List[Dict[str, torch.Tensor]]`, one for
+                      each input image. The fields of the Dict are as follows:

-                      - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values \
-                        between 0 and H and 0 and W
-                      - labels (Int64Tensor[N]): the predicted labels for each image
-                      - scores (Tensor[N]): the scores or each prediction
+                      - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and
+                        0 <= y1 < y2 <= H.
+                      - labels [N]: the labels for each image.
+                      - scores [N]: the scores of each prediction.
         :param input_shape: The shape of one input sample.
         :param optimizer: The optimizer for training the classifier.
         :param clip_values: Tuple of the form `(min, max)` of floats or `np.ndarray` representing the minimum and
diff --git a/art/estimators/object_detection/pytorch_object_detector.py b/art/estimators/object_detection/pytorch_object_detector.py
index d6317c10a5..4316d4e202 100644
--- a/art/estimators/object_detection/pytorch_object_detector.py
+++ b/art/estimators/object_detection/pytorch_object_detector.py
@@ -52,7 +52,7 @@ def __init__(
         input_shape: Tuple[int, ...] = (-1, -1, -1),
         optimizer: Optional["torch.optim.Optimizer"] = None,
         clip_values: Optional["CLIP_VALUES_TYPE"] = None,
-        channels_first: Optional[bool] = False,
+        channels_first: Optional[bool] = True,
         preprocessing_defences: Union["Preprocessor", List["Preprocessor"], None] = None,
         postprocessing_defences: Union["Postprocessor", List["Postprocessor"], None] = None,
         preprocessing: "PREPROCESSING_TYPE" = None,
@@ -67,13 +67,13 @@ def __init__(
         """
         Initialization.

-        :param model: Object detection model. The output of the model is `List[Dict[Tensor]]`, one for each input
-                      image. The fields of the Dict are as follows:
+        :param model: Object detection model. The output of the model is `List[Dict[str, torch.Tensor]]`, one for
+                      each input image. The fields of the Dict are as follows:

-                      - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values
-                        between 0 and H and 0 and W
-                      - labels (Int64Tensor[N]): the predicted labels for each image
-                      - scores (Tensor[N]): the scores or each prediction
+                      - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and
+                        0 <= y1 < y2 <= H.
+                      - labels [N]: the labels for each image.
+                      - scores [N]: the scores of each prediction.
         :param input_shape: The shape of one input sample.
         :param optimizer: The optimizer for training the classifier.
         :param clip_values: Tuple of the form `(min, max)` of floats or `np.ndarray` representing the minimum and
@@ -215,7 +215,7 @@ def _preprocess_and_convert_inputs(
             if not self.channels_first:
                 x_tensor = torch.permute(x_tensor, (0, 3, 1, 2))
-            x_tensor /= norm_factor
+            x_tensor = x_tensor / norm_factor

            # Set gradients
            if not no_grad:
@@ -236,7 +236,7 @@ def _preprocess_and_convert_inputs(
            if not self.channels_first:
                x_preprocessed = torch.permute(x_preprocessed, (0, 3, 1, 2))
-            x_preprocessed /= norm_factor
+            x_preprocessed = x_preprocessed / norm_factor

            # Set gradients
            if not no_grad:
diff --git a/art/estimators/object_detection/pytorch_yolo.py b/art/estimators/object_detection/pytorch_yolo.py
index cdc4c72446..1570217d0a 100644
--- a/art/estimators/object_detection/pytorch_yolo.py
+++ b/art/estimators/object_detection/pytorch_yolo.py
@@ -103,8 +103,8 @@ def translate_labels_x1y1x2y2_to_xcycwh(
        labels[:, 2:6] = label_dict["boxes"]

        # normalize bounding boxes to [0, 1]
-        labels[:, 2:6:2] /= width
-        labels[:, 3:6:2] /= height
+        labels[:, 2:6:2] = labels[:, 2:6:2] / width
+        labels[:, 3:6:2] = labels[:, 3:6:2] / height

        # convert from x1y1x2y2 to xcycwh
        labels[:, 4] -= labels[:, 2]
@@ -148,7 +148,7 @@ def __init__(
        """
        Initialization.

-        :param model: Object detection model wrapped as demonstrated in examples/get_started_yolo.py.
+        :param model: YOLO v3 or v5 model wrapped as demonstrated in examples/get_started_yolo.py.
                      The output of the model is `List[Dict[str, torch.Tensor]]`, one for each input image.
                      The fields of the Dict are as follows:

@@ -290,7 +290,7 @@ def _preprocess_and_convert_inputs(
            if not self.channels_first:
                x_tensor = torch.permute(x_tensor, (0, 3, 1, 2))
-            x_tensor /= norm_factor
+            x_tensor = x_tensor / norm_factor

            # Set gradients
            if not no_grad:
@@ -311,7 +311,7 @@ def _preprocess_and_convert_inputs(
            if not self.channels_first:
                x_preprocessed = torch.permute(x_preprocessed, (0, 3, 1, 2))
-            x_preprocessed /= norm_factor
+            x_preprocessed = x_preprocessed / norm_factor

            # Set gradients
            if not no_grad:
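Not part of the patch: a minimal usage sketch of the updated interface, assuming a torchvision install and an ART version that includes this change. It illustrates the two user-visible effects of the diff, namely that any torchvision FasterRCNN instance now satisfies the widened type hint (not only fasterrcnn_resnet50_fpn) and that channels_first defaults to True, so inputs are expected in NCHW layout. The image size 416x416 and clip_values=(0, 255) are arbitrary choices for the example.

# Illustrative sketch only; not shipped with the patch.
import numpy as np
import torchvision

from art.estimators.object_detection import PyTorchFasterRCNN

# Any torchvision FasterRCNN instance matches the widened type hint.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn()

# channels_first now defaults to True; passed explicitly here for clarity.
detector = PyTorchFasterRCNN(model=model, clip_values=(0, 255), channels_first=True)

# One NCHW image batch in [0, 255]; predictions follow the documented
# List[Dict[str, ...]] format with "boxes", "labels", and "scores" per image.
x = np.random.randint(0, 255, size=(1, 3, 416, 416)).astype(np.float32)
predictions = detector.predict(x)
print(predictions[0]["boxes"].shape, predictions[0]["labels"].shape, predictions[0]["scores"].shape)

On the normalization changes: replacing the in-place `x_tensor /= norm_factor` with the out-of-place `x_tensor = x_tensor / norm_factor` leaves the caller's tensor untouched and avoids the RuntimeError PyTorch raises for in-place operations on leaf tensors that require gradients, which matters when gradients are later propagated back to the input.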