From 24f4444b3bcb2b6365534bc85fcecd4de649acc2 Mon Sep 17 00:00:00 2001 From: Bernadette Bucher Date: Thu, 14 Dec 2023 06:08:48 -0500 Subject: [PATCH 1/5] mypy fixes, bug fixes, deprecated code removal --- vlfm/mapping/base_map.py | 3 +- vlfm/mapping/object_point_cloud_map.py | 4 +- vlfm/mapping/obstacle_map.py | 5 +- vlfm/mapping/traj_visualizer.py | 2 +- vlfm/policy/base_objectnav_policy.py | 4 +- vlfm/policy/reality_policies.py | 10 +- .../non_habitat_policy/nh_pointnav_policy.py | 8 +- .../policy/utils/non_habitat_policy/resnet.py | 1 - .../non_habitat_policy/rnn_state_encoder.py | 37 ++-- vlfm/policy/utils/pointnav_policy.py | 2 +- vlfm/reality/bdsw_nav_env.py | 3 +- vlfm/reality/objectnav_env.py | 6 +- vlfm/reality/pointnav_env.py | 8 +- vlfm/reality/robots/bdsw_robot.py | 4 +- vlfm/semexp_env/semexp_policy.py | 169 ------------------ vlfm/utils/episode_stats_logger.py | 2 +- vlfm/utils/geometry_utils.py | 4 +- vlfm/vlm/detections.py | 19 +- vlfm/vlm/server_wrapper.py | 8 +- 19 files changed, 69 insertions(+), 230 deletions(-) delete mode 100644 vlfm/semexp_env/semexp_policy.py diff --git a/vlfm/mapping/base_map.py b/vlfm/mapping/base_map.py index 62f2eeb..8247481 100644 --- a/vlfm/mapping/base_map.py +++ b/vlfm/mapping/base_map.py @@ -8,10 +8,9 @@ class BaseMap: - _confidence_mask: np.ndarray = None _camera_positions: List[np.ndarray] = [] _last_camera_yaw: float = 0.0 - _map_dtype: np.dtype = np.float32 + _map_dtype: np.dtype = np.dtype(np.float32) def __init__( self, size: int = 1000, pixels_per_meter: int = 20, *args: Any, **kwargs: Any diff --git a/vlfm/mapping/object_point_cloud_map.py b/vlfm/mapping/object_point_cloud_map.py index df38d54..eb0acaa 100644 --- a/vlfm/mapping/object_point_cloud_map.py +++ b/vlfm/mapping/object_point_cloud_map.py @@ -55,7 +55,7 @@ def update_map( else: # Mark all points of local_cloud whose distance from the camera is too far # as being out of range - within_range = local_cloud[:, 0] <= max_depth * 0.95 # 5% margin + within_range = (local_cloud[:, 0] <= max_depth * 0.95) * 1.0 # 5% margin # All values of 1 in within_range will be considered within range, and all # values of 0 will be considered out of range; these 0s need to be # assigned with a random number so that they can be identified later. @@ -147,7 +147,7 @@ def update_explored( def get_target_cloud(self, target_class: str) -> np.ndarray: target_cloud = self.clouds[target_class].copy() # Determine whether any points are within range - within_range_exists: bool = np.any(target_cloud[:, -1] == 1) + within_range_exists = np.any(target_cloud[:, -1] == 1) if within_range_exists: # Filter out all points that are not within range target_cloud = target_cloud[target_cloud[:, -1] == 1] diff --git a/vlfm/mapping/obstacle_map.py b/vlfm/mapping/obstacle_map.py index 79a2962..29840b2 100644 --- a/vlfm/mapping/obstacle_map.py +++ b/vlfm/mapping/obstacle_map.py @@ -2,6 +2,7 @@ import cv2 import numpy as np +from typing import Union, Any from frontier_exploration.frontier_detection import detect_frontier_waypoints from frontier_exploration.utils.fog_of_war import reveal_fog_of_war @@ -15,7 +16,7 @@ class ObstacleMap(BaseMap): and another representing the obstacles that the robot has seen so far. 
""" - _map_dtype: np.dtype = bool + _map_dtype: np.dtype = np.dtype(bool) _frontiers_px: np.ndarray = np.array([]) frontiers: np.ndarray = np.array([]) radius_padding_color: tuple = (100, 100, 100) @@ -52,7 +53,7 @@ def reset(self) -> None: def update_map( self, - depth: np.ndarray, + depth: Union[np.ndarray, Any], tf_camera_to_episodic: np.ndarray, min_depth: float, max_depth: float, diff --git a/vlfm/mapping/traj_visualizer.py b/vlfm/mapping/traj_visualizer.py index 4233c92..11a4a14 100644 --- a/vlfm/mapping/traj_visualizer.py +++ b/vlfm/mapping/traj_visualizer.py @@ -36,7 +36,7 @@ def draw_trajectory( img = self._draw_agent(img, camera_positions[-1], camera_yaw) return img - def _draw_path(self, img: np.ndarray, camera_positions: np.ndarray) -> np.ndarray: + def _draw_path(self, img: np.ndarray, camera_positions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: """Draws the path on the image and returns it""" if len(camera_positions) < 2: return img diff --git a/vlfm/policy/base_objectnav_policy.py b/vlfm/policy/base_objectnav_policy.py index a81da18..85a9c67 100644 --- a/vlfm/policy/base_objectnav_policy.py +++ b/vlfm/policy/base_objectnav_policy.py @@ -34,8 +34,8 @@ class BasePolicy: # type: ignore class BaseObjectNavPolicy(BasePolicy): _target_object: str = "" _policy_info: Dict[str, Any] = {} - _object_masks: np.ndarray = None # set by ._update_object_map() - _stop_action: Tensor = None # MUST BE SET BY SUBCLASS + _object_masks: Union[np.ndarray, Any] = None # set by ._update_object_map() + _stop_action: Union[Tensor, Any] = None # MUST BE SET BY SUBCLASS _observations_cache: Dict[str, Any] = {} _non_coco_caption = "" _load_yolo: bool = True diff --git a/vlfm/policy/reality_policies.py b/vlfm/policy/reality_policies.py index c650f01..c863fd5 100644 --- a/vlfm/policy/reality_policies.py +++ b/vlfm/policy/reality_policies.py @@ -1,7 +1,7 @@ # Copyright (c) 2023 Boston Dynamics AI Institute LLC. All rights reserved. 
from dataclasses import dataclass -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, Tuple, Union, Optional import numpy as np import torch @@ -56,7 +56,7 @@ def from_config( def act( self: Union["RealityMixin", ITMPolicyV2], observations: Dict[str, Any], - rnn_hidden_states: Tensor, + rnn_hidden_states: Union[Tensor, Any], prev_actions: Any, masks: Tensor, deterministic: bool = False, @@ -77,10 +77,10 @@ def act( if self._done_initializing: angular = action[0][0].item() linear = action[0][1].item() - arm_yaw = -1 + arm_yaw = -1.0 else: - angular = 0 - linear = 0 + angular = 0.0 + linear = 0.0 arm_yaw = action[0][0].item() self._done_initializing = len(self._initial_yaws) == 0 diff --git a/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py b/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py index f0578e0..6373e96 100644 --- a/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py +++ b/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py @@ -49,9 +49,9 @@ class PointNavResNetNet(nn.Module): def __init__(self, discrete_actions: bool = False, no_fwd_dict: bool = False): super().__init__() if discrete_actions: - self.prev_action_embedding = nn.Embedding(4 + 1, 32) + self.prev_action_embedding_discrete = nn.Embedding(4 + 1, 32) else: - self.prev_action_embedding = nn.Linear( + self.prev_action_embedding_cont = nn.Linear( in_features=2, out_features=32, bias=True ) self.tgt_embeding = nn.Linear(in_features=3, out_features=32, bias=True) @@ -95,11 +95,11 @@ def forward( prev_actions = prev_actions.squeeze(-1) start_token = torch.zeros_like(prev_actions) # The mask means the previous action will be zero, an extra dummy action - prev_actions = self.prev_action_embedding( + prev_actions = self.prev_action_embedding_discrete( torch.where(masks.view(-1), prev_actions + 1, start_token) ) else: - prev_actions = self.prev_action_embedding(masks * prev_actions.float()) + prev_actions = self.prev_action_embedding_cont(masks * prev_actions.float()) x.append(prev_actions) diff --git a/vlfm/policy/utils/non_habitat_policy/resnet.py b/vlfm/policy/utils/non_habitat_policy/resnet.py index e94737e..73e03be 100755 --- a/vlfm/policy/utils/non_habitat_policy/resnet.py +++ b/vlfm/policy/utils/non_habitat_policy/resnet.py @@ -149,7 +149,6 @@ def _make_layer( def forward(self, x: Tensor) -> Tensor: x = self.conv1(x) x = self.maxpool(x) - x = cast(Tensor, x) x = self.layer1(x) x = self.layer2(x) x = self.layer3(x) diff --git a/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py b/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py index 30e0ea2..7fa7469 100755 --- a/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py +++ b/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py @@ -6,7 +6,7 @@ # https://github.com/facebookresearch/habitat-lab/blob/main/habitat-baselines/habitat_baselines/rl/models/rnn_state_encoder.py # This is a filtered down version that only supports LSTM -from typing import Dict, Optional, Tuple +from typing import Dict, Optional, Tuple, Any import torch import torch.nn as nn @@ -20,6 +20,23 @@ class RNNStateEncoder(nn.Module): is that it takes an addition masks input that resets the hidden state between two adjacent timesteps to handle episodes ending in the middle of a rollout. 
""" + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int = 1, + ): + super().__init__() + + self.num_recurrent_layers = num_layers * 2 + + self.rnn = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + ) + + self.layer_init() def layer_init(self) -> None: for name, param in self.rnn.named_parameters(): @@ -112,26 +129,18 @@ def __init__( hidden_size: int, num_layers: int = 1, ): - super().__init__() - - self.num_recurrent_layers = num_layers * 2 - - self.rnn = nn.LSTM( - input_size=input_size, - hidden_size=hidden_size, - num_layers=num_layers, - ) - - self.layer_init() + super().__init__(input_size, hidden_size, num_layers) + # Note: Type handling mypy errors in pytorch libraries prevent + # directly setting hidden_states type def pack_hidden( - self, hidden_states: Tuple[torch.Tensor, torch.Tensor] + self, hidden_states: Any # type is Tuple[torch.Tensor, torch.Tensor] ) -> torch.Tensor: return torch.cat(hidden_states, 0) def unpack_hidden( self, hidden_states: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> Any: # type is Tuple[torch.Tensor, torch.Tensor] lstm_states = torch.chunk(hidden_states.contiguous(), 2, 0) return (lstm_states[0], lstm_states[1]) diff --git a/vlfm/policy/utils/pointnav_policy.py b/vlfm/policy/utils/pointnav_policy.py index b05b55d..002de45 100644 --- a/vlfm/policy/utils/pointnav_policy.py +++ b/vlfm/policy/utils/pointnav_policy.py @@ -203,7 +203,7 @@ def load_pointnav_policy(file_path: str) -> PointNavResNetTensorOutputPolicy: def move_obs_to_device( - observations: Dict[str, Union[Tensor, np.ndarray]], + observations: Dict[str, Any], device: torch.device, unsqueeze: bool = False, ) -> Dict[str, Tensor]: diff --git a/vlfm/reality/bdsw_nav_env.py b/vlfm/reality/bdsw_nav_env.py index 3072ee6..93cd6d5 100644 --- a/vlfm/reality/bdsw_nav_env.py +++ b/vlfm/reality/bdsw_nav_env.py @@ -16,8 +16,9 @@ def run_env( done = False mask = torch.zeros(1, 1, device=policy.device, dtype=torch.bool) action = policy.act(observations, mask) + action_dict = {"rho_theta": action} while not done: - observations, _, done, info = env.step(action) + observations, _, done, info = env.step(action_dict) action = policy.act(observations, mask, deterministic=True) mask = torch.ones_like(mask) diff --git a/vlfm/reality/objectnav_env.py b/vlfm/reality/objectnav_env.py index a8b2264..78b9cfa 100644 --- a/vlfm/reality/objectnav_env.py +++ b/vlfm/reality/objectnav_env.py @@ -59,7 +59,7 @@ def __init__(self, max_gripper_cam_depth: float, *args: Any, **kwargs: Any) -> N self._vis_dir = f"{date_string}" os.makedirs(f"vis/{self._vis_dir}", exist_ok=True) - def reset(self, goal: Any, *args: Any, **kwargs: Any) -> Dict[str, np.ndarray]: + def reset(self, goal: Any, *args: Any, **kwargs: Any) -> Dict[str, Any]: assert isinstance(goal, str) self.target_object = goal # Transformation matrix from where the robot started to the global frame @@ -69,7 +69,7 @@ def reset(self, goal: Any, *args: Any, **kwargs: Any) -> Dict[str, np.ndarray]: self.episodic_start_yaw = self.robot.xy_yaw[1] return self._get_obs() - def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]: + def step(self, action: Dict[str, Any]) -> Tuple[Dict, float, bool, Dict]: # Parent class only moves the base; if we want to move the gripper camera, # we need to do it here vis_imgs = [] @@ -117,7 +117,7 @@ def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]: return self._get_obs(), 0.0, done, {} # not using info dict yet 
- def _get_obs(self) -> Dict[str, np.ndarray]: + def _get_obs(self) -> Dict[str, Any]: robot_xy, robot_heading = self._get_gps(), self._get_compass() nav_depth, obstacle_map_depths, value_map_rgbd, object_map_rgbd = ( self._get_camera_obs() diff --git a/vlfm/reality/pointnav_env.py b/vlfm/reality/pointnav_env.py index 191ee96..7d1bc45 100644 --- a/vlfm/reality/pointnav_env.py +++ b/vlfm/reality/pointnav_env.py @@ -54,7 +54,7 @@ def reset( self.goal = goal return self._get_obs() - def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]: + def step(self, action: Dict[str, Any]) -> Tuple[Dict, float, bool, Dict]: if self._cmd_id is not None: cmd_status = 0 while cmd_status != 1: @@ -100,14 +100,14 @@ def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]: self._num_steps += 1 return self._get_obs(), 0.0, done, {} # not using info dict yet - def _compute_velocities(self, action: Dict[str, np.ndarray]) -> Tuple[float, float]: + def _compute_velocities(self, action: Dict[str, Any]) -> Tuple[float, float]: ang_dist, lin_dist = self._compute_displacements(action) ang_vel = ang_dist / self._time_step lin_vel = lin_dist / self._time_step return ang_vel, lin_vel def _compute_displacements( - self, action: Dict[str, np.ndarray] + self, action: Dict[str, Any] ) -> Tuple[float, float]: displacements = [] for action_key, max_dist in ( @@ -120,7 +120,7 @@ def _compute_displacements( act_val = action[action_key] dist = np.clip(act_val, -1.0, 1.0) # clip to [-1, 1] dist *= max_dist # scale to max distance - displacements.append(dist) # convert to velocity + displacements.append(dist) # convert to velocities ang_dist, lin_dist = displacements return ang_dist, lin_dist diff --git a/vlfm/reality/robots/bdsw_robot.py b/vlfm/reality/robots/bdsw_robot.py index aa81e96..3b958e5 100644 --- a/vlfm/reality/robots/bdsw_robot.py +++ b/vlfm/reality/robots/bdsw_robot.py @@ -89,7 +89,7 @@ def open_gripper(self) -> None: """Opens the gripper""" self.spot.open_gripper() - def get_camera_data(self, srcs: List[str]) -> Dict[str, np.ndarray]: + def get_camera_data(self, srcs: List[str]) -> Dict[str, Dict[str, Any]]: """Returns a dict that maps each camera id to its image, focal lengths, and transform matrix (from camera to global frame). @@ -106,7 +106,7 @@ def get_camera_data(self, srcs: List[str]) -> Dict[str, np.ndarray]: } return imgs - def _camera_response_to_data(self, response: Any) -> Dict[str, np.ndarray]: + def _camera_response_to_data(self, response: Any) -> Dict[str, Any]: image: np.ndarray = image_response_to_cv2(response, reorient=False) fx: float = response.source.pinhole.intrinsics.focal_length.x fy: float = response.source.pinhole.intrinsics.focal_length.y diff --git a/vlfm/semexp_env/semexp_policy.py b/vlfm/semexp_env/semexp_policy.py deleted file mode 100644 index 3571219..0000000 --- a/vlfm/semexp_env/semexp_policy.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2023 Boston Dynamics AI Institute LLC. All rights reserved. 
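The `_compute_velocities`/`_compute_displacements` pair above turns normalized policy outputs into commands by clipping to [-1, 1], scaling by the per-action maximum displacement, and dividing by the control period. A sketch using the `max_ang_dist`/`max_lin_dist` values from config/experiments/reality.yaml in this series (the 0.5 s time step is an assumption):

    import numpy as np

    max_ang_dist, max_lin_dist = 0.523599, 0.2  # from config/experiments/reality.yaml
    time_step = 0.5  # hypothetical control period in seconds

    action = {"angular": 1.7, "linear": -0.3}  # raw, unclipped policy outputs
    ang_dist = float(np.clip(action["angular"], -1.0, 1.0)) * max_ang_dist
    lin_dist = float(np.clip(action["linear"], -1.0, 1.0)) * max_lin_dist
    ang_vel, lin_vel = ang_dist / time_step, lin_dist / time_step  # rad/s, m/s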
- -from typing import Any, Dict, Tuple, Union - -import numpy as np -import torch -from depth_camera_filtering import filter_depth -from torch import Tensor - -from vlfm.mapping.obstacle_map import ObstacleMap -from vlfm.policy.base_objectnav_policy import BaseObjectNavPolicy -from vlfm.policy.itm_policy import ITMPolicy, ITMPolicyV2, ITMPolicyV3 -from vlfm.utils.geometry_utils import xyz_yaw_to_tf_matrix -from vlfm.vlm.grounding_dino import ObjectDetections - - -class TorchActionIDs: - STOP = torch.tensor([[0]], dtype=torch.long) - MOVE_FORWARD = torch.tensor([[1]], dtype=torch.long) - TURN_LEFT = torch.tensor([[2]], dtype=torch.long) - TURN_RIGHT = torch.tensor([[3]], dtype=torch.long) - - -class SemExpMixin: - """This Python mixin only contains code relevant for running a BaseObjectNavPolicy - explicitly within Habitat (vs. the real world, etc.) and will endow any parent class - (that is a subclass of BaseObjectNavPolicy) with the necessary methods to run in - Habitat. - """ - - _stop_action: Tensor = TorchActionIDs.STOP - _start_yaw: Union[float, None] = None # must be set by _reset() method - _observations_cache: Dict[str, Any] = {} - _policy_info: Dict[str, Any] = {} - - def __init__( - self: Union["SemExpMixin", BaseObjectNavPolicy], - camera_height: float, - min_depth: float, - max_depth: float, - camera_fov: float, - image_width: int, - *args: Any, - **kwargs: Any, - ) -> None: - super().__init__(*args, **kwargs) # type: ignore - assert self._compute_frontiers, "Must set self._compute_frontiers = True" - self._camera_height = camera_height - self._min_depth = min_depth - self._max_depth = max_depth - camera_fov_rad = np.deg2rad(camera_fov) - self._camera_fov = camera_fov_rad - self._fx = self._fy = image_width / (2 * np.tan(camera_fov_rad / 2)) - - self._compute_frontiers: bool = super()._compute_frontiers # type: ignore - - def act( - self: Union["SemExpMixin", BaseObjectNavPolicy], - observations: Dict[str, Union[Tensor, str]], - rnn_hidden_states: Any, - prev_actions: Any, - masks: Tensor, - deterministic: bool = True, - ) -> Tuple[Tensor, Dict[str, Any]]: - """Converts object ID to string name, returns action as PolicyActionData""" - parent_cls: BaseObjectNavPolicy = super() # type: ignore - try: - action, rnn_hidden_states = parent_cls.act( - observations, None, None, masks, deterministic - ) - except StopIteration: - action = self._stop_action - return action, self._policy_info - - def _initialize(self) -> Tensor: - """Turn left 30 degrees 12 times to get a 360 view at the beginning""" - self._done_initializing = not self._num_steps < 11 # type: ignore - return TorchActionIDs.TURN_LEFT - - def _reset(self) -> None: - parent_cls: BaseObjectNavPolicy = super() # type: ignore - parent_cls._reset() - self._start_yaw = None - - def _get_policy_info(self, detections: ObjectDetections) -> Dict[str, Any]: - """Get policy info for logging""" - parent_cls: BaseObjectNavPolicy = super() # type: ignore - info = parent_cls._get_policy_info(detections) - - if not self._visualize: # type: ignore - return info - - if self._start_yaw is None: - self._start_yaw = self._observations_cache["habitat_start_yaw"] - info["start_yaw"] = self._start_yaw - return info - - def _cache_observations( - self: Union["SemExpMixin", BaseObjectNavPolicy], observations: Dict[str, Any] - ) -> None: - """Caches the rgb, depth, and camera transform from the observations. - - Args: - observations (TensorDict): The observations from the current timestep. 
- """ - if len(self._observations_cache) > 0: - return - rgb = observations["rgb"][0].cpu().numpy() - depth = observations["depth"][0].cpu().numpy() - x, y = observations["gps"][0].cpu().numpy() - camera_yaw = observations["compass"][0].cpu().item() - depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None) - # Habitat GPS makes west negative, so flip y - camera_position = np.array([x, -y, self._camera_height]) - robot_xy = camera_position[:2] - tf_camera_to_episodic = xyz_yaw_to_tf_matrix(camera_position, camera_yaw) - - self._obstacle_map: ObstacleMap - self._obstacle_map.update_map( - depth, - tf_camera_to_episodic, - self._min_depth, - self._max_depth, - self._fx, - self._fy, - self._camera_fov, - ) - frontiers = self._obstacle_map.frontiers - self._obstacle_map.update_agent_traj(robot_xy, camera_yaw) - self._observations_cache = { - "frontier_sensor": frontiers, - "nav_depth": observations["depth"], # for pointnav - "robot_xy": robot_xy, - "robot_heading": camera_yaw, - "object_map_rgbd": [ - ( - rgb, - depth, - tf_camera_to_episodic, - self._min_depth, - self._max_depth, - self._fx, - self._fy, - ) - ], - "value_map_rgbd": [ - ( - rgb, - depth, - tf_camera_to_episodic, - self._min_depth, - self._max_depth, - self._camera_fov, - ) - ], - "habitat_start_yaw": observations["heading"][0].item(), - } - - -class SemExpITMPolicy(SemExpMixin, ITMPolicy): - pass - - -class SemExpITMPolicyV2(SemExpMixin, ITMPolicyV2): - pass - - -class SemExpITMPolicyV3(SemExpMixin, ITMPolicyV3): - pass diff --git a/vlfm/utils/episode_stats_logger.py b/vlfm/utils/episode_stats_logger.py index 4b31da9..892180c 100644 --- a/vlfm/utils/episode_stats_logger.py +++ b/vlfm/utils/episode_stats_logger.py @@ -77,7 +77,7 @@ def was_target_seen(infos: Dict[str, Any]) -> bool: explored_area = infos["top_down_map"]["fog_of_war_mask"] # Dilate the target_bboxes_mask by 10 pixels to add a margin of error target_bboxes_mask = cv2.dilate(target_bboxes_mask, np.ones((10, 10))) - target_explored = np.any(np.logical_and(explored_area, target_bboxes_mask)) + target_explored = bool(np.any(np.logical_and(explored_area, target_bboxes_mask))) return target_explored diff --git a/vlfm/utils/geometry_utils.py b/vlfm/utils/geometry_utils.py index e03e7e9..e600d2b 100644 --- a/vlfm/utils/geometry_utils.py +++ b/vlfm/utils/geometry_utils.py @@ -33,7 +33,7 @@ def rho_theta( rho = np.linalg.norm(local_goal) theta = np.arctan2(local_goal[1], local_goal[0]) - return rho, theta + return float(rho), float(theta) def get_rotation_matrix(angle: float, ndims: int = 2) -> np.ndarray: @@ -58,7 +58,7 @@ def get_rotation_matrix(angle: float, ndims: int = 2) -> np.ndarray: raise ValueError("ndims must be 2 or 3") -def wrap_heading(theta: Union[float, np.ndarray]) -> Union[float, np.ndarray]: +def wrap_heading(theta: float) -> float: """Wraps given angle to be between -pi and pi. 
Args: diff --git a/vlfm/vlm/detections.py b/vlfm/vlm/detections.py index 9dbae44..3ec1917 100644 --- a/vlfm/vlm/detections.py +++ b/vlfm/vlm/detections.py @@ -23,7 +23,7 @@ def __init__( boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str], - image_source: Optional[np.ndarray] = None, + image_source: Optional[np.ndarray], fmt: str = "cxcywh", ): self.image_source = image_source @@ -36,8 +36,8 @@ def __init__( self._annotated_frame: Optional[np.ndarray] = None @property - def annotated_frame(self) -> np.ndarray: - if self._annotated_frame is None: + def annotated_frame(self) -> Optional[np.ndarray]: + if self._annotated_frame is None and self.image_source is not None: self._annotated_frame = annotate( image_source=self.image_source, boxes=self.boxes, @@ -67,7 +67,7 @@ def filter_by_conf(self, conf_thresh: float) -> None: Args: conf_thresh (float): Confidence threshold to filter detections. """ - keep: torch.Tensor[bool] = torch.ge(self.logits, conf_thresh) # >= + keep: torch.Tensor = torch.ge(self.logits, conf_thresh) # >= self._filter(keep) def filter_by_class(self, classes: List[str]) -> None: @@ -76,7 +76,7 @@ def filter_by_class(self, classes: List[str]) -> None: Args: classes (List[str]): List of classes to keep. """ - keep: torch.Tensor[bool] = torch.tensor( + keep: torch.Tensor = torch.tensor( [p in classes for p in self.phrases], dtype=torch.bool ) self._filter(keep) @@ -215,15 +215,16 @@ def draw_bounding_box( if color is None: # Randomly choose a color from the rainbow colormap (so boxes aren't black) - single_pixel = np.array([[np.random.randint(0, 256)]], dtype=np.uint8) + single_pixel = np.array([[np.random.randint(0, 256)]]) single_pixel = cv2.applyColorMap(single_pixel, cv2.COLORMAP_RAINBOW) # reshape to a single dimensional array - color = single_pixel.reshape(3) + rand_color = single_pixel.reshape(3) + color = [int(c) for c in rand_color] # type: ignore else: # Convert RGB to BGR - color = color[::-1] - color = [int(c) for c in color] # type: ignore + bgr_color = color[::-1] + color = [int(c) for c in bgr_color] # type: ignore # Draw bounding box on image box_thickness = 2 diff --git a/vlfm/vlm/server_wrapper.py b/vlfm/vlm/server_wrapper.py index 32d2d00..344057a 100644 --- a/vlfm/vlm/server_wrapper.py +++ b/vlfm/vlm/server_wrapper.py @@ -37,8 +37,7 @@ def process_request() -> Dict[str, Any]: def bool_arr_to_str(arr: np.ndarray) -> str: """Converts a boolean array to a string.""" - packed = np.packbits(arr) - packed_str = base64.b64encode(packed).decode() + packed_str = base64.b64encode(arr.tobytes()).decode() return packed_str @@ -50,9 +49,8 @@ def str_to_bool_arr(s: str, shape: tuple) -> np.ndarray: # Convert bytes to np.uint8 array bytes_array = np.frombuffer(bytes_, dtype=np.uint8) - # Unpack the bytes back into a boolean array - unpacked = np.unpackbits(bytes_array) - unpacked = unpacked.reshape(shape) + # Reshape the data back into a boolean array + unpacked = bytes_array.reshape(shape) return unpacked From 853d84da43759b45277c90c847b4d56e88fcdfee Mon Sep 17 00:00:00 2001 From: Bernadette Bucher Date: Fri, 15 Dec 2023 13:04:28 -0500 Subject: [PATCH 2/5] latest pre-commit checks from bdai --- .pre-commit-config.yaml | 105 +++++++++++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 28 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 13392d6..1d45437 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,48 +1,97 @@ -# Copyright [2023] Boston Dynamics AI Institute, Inc. 
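The server_wrapper.py change above sends boolean masks as raw bytes (one byte per element) instead of bit-packing them; PATCH 4/5 later restores the `np.packbits` form, which is 8x smaller on the wire but, because `np.unpackbits` pads to a whole number of bytes, only reshapes cleanly when the mask size is a multiple of 8. A sketch of the unpacked round trip shown here, with a hypothetical mask shape (note the cast back to bool, since the receiver sees uint8):

    import base64

    import numpy as np

    arr = np.random.rand(5, 7) > 0.5  # hypothetical boolean mask

    packed_str = base64.b64encode(arr.tobytes()).decode()  # one byte per element
    bytes_array = np.frombuffer(base64.b64decode(packed_str), dtype=np.uint8)
    restored = bytes_array.reshape((5, 7)).astype(bool)
    assert np.array_equal(arr, restored)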
+# Copyright (c) 2023 Boston Dynamics AI Institute LLC. All rights reserved. repos: -- repo: https://github.com/charliermarsh/ruff-pre-commit - rev: 'v0.0.263' - hooks: - - id: ruff - args: ['--fix', '--config', 'pyproject.toml'] +- repo: https://github.com/charliermarsh/ruff-pre-commit + # Ruff version. + rev: 'v0.1.0' + hooks: + - id: ruff + args: ['--fix', '--config', 'pyproject.toml'] # we want this to refer to `bdai/pyproject.toml` +- repo: https://github.com/psf/black + rev: 23.10.0 + hooks: + - id: black + language_version: python3.10 + args: ['--config', 'pyproject.toml'] # we want this to refer to `bdai/pyproject.toml` + verbose: true - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-yaml - args: ['--unsafe'] + args: ['--unsafe'] # details about the unsafe flag: + # https://github.com/pre-commit/pre-commit-hooks#check-yaml + # This is the solution proposed to prevent `check-yaml` from failing on custom tags: + # https://github.com/pre-commit/pre-commit-hooks/issues/701 - id: check-added-large-files args: ['--enforce-all', '--maxkb', '200'] + # For the `exclude` argument, see https://pre-commit.com/#regular-expressions + # Make sure to escape strings correctly to ensure literal matches, for example, using Python + # + # >>> print(re.escape("path/to-some/file.ext")) + # path/to\-some/file\.ext + # + # `path/to\-some/file\.ext` is the correct regex pattern to match `path/to-some/file.ext` literally. + # The inverse operation is more cumbersome: https://stackoverflow.com/a/54840284 + exclude: | + (?x)^( + docker/ros/web/static/novnc/vendor/browser\-es\-module\-loader/dist/babel\-worker\.js| + docker/ros/rootfs/usr/local/share/doro\-lxde\-wallpapers/bg.*\.jpg| + docker/ros/web/yarn\.lock| + src/modelzoo/detic/lvis_v1_train_cat_info\.json| + src/modelzoo/edge_grasp_serve/example_pc\.npy + )$ - id: check-toml - id: end-of-file-fixer - id: check-merge-conflict - id: check-executables-have-shebangs - id: check-shebang-scripts-are-executable - -- repo: https://github.com/psf/black - rev: 23.3.0 +- repo: https://github.com/ssciwr/clang-format-hook.git + rev: v12.0.1 # Use the sha / tag you want to point at hooks: - - id: black - language_version: python3.9 - args: ['--config', 'pyproject.toml'] - verbose: true - -- repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.2.0 + - id: clang-format + types_or: [c++, c, cuda] +- repo: https://github.com/cpplint/cpplint.git + rev: 1.6.1 hooks: - - id: mypy - pass_filenames: false - additional_dependencies: - - types-protobuf - - types-requests - - types-simplejson - - types-ujson - - types-PyYAML - - types-toml - - types-six + - id: cpplint + args: ['--quiet'] + exclude_types: [cuda] +#- repo: https://github.com/leoll2/copyright_notice_precommit +# rev: 0.1.1 +# hooks: +# - id: copyright-notice +# args: [--notice=copyright.txt] + +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.6.1 + hooks: + - id: mypy + pass_filenames: false + additional_dependencies: + - types-protobuf + - types-requests + - types-simplejson + - types-ujson + - types-PyYAML + - types-toml + - types-six - repo: https://github.com/jumanjihouse/pre-commit-hooks rev: 3.0.0 hooks: - id: forbid-binary + # Note: to exclude files, uncomment the next line and provide the regex to the associated file or files + exclude: > + (?x)^( + docker/ros/web/.*| + docker/ros/rootfs/.* + )$ +- repo: local + hooks: + - id: check-bdai-deps + name: check bdai dependency rules + entry: ./scripts/check_dependency_rules.sh + language: 
script
    files: \.py$
    require_serial: true
\ No newline at end of file

From 77cf94da077f70ddc0cf34b72752f69827a98ede Mon Sep 17 00:00:00 2001
From: Katrina Ashton
Date: Fri, 15 Dec 2023 14:20:37 -0500
Subject: [PATCH 3/5] Reality changes. Match laptop config for reality, copy
 over required file, allow loading in pointnav weight checkpoint used in
 previous experiments

---
 config/experiments/reality.yaml      | 4 ++--
 vlfm/policy/utils/pointnav_policy.py | 9 ++++++++-
 vlfm/reality/robots/frame_ids.py     | 8 ++++++++
 3 files changed, 18 insertions(+), 3 deletions(-)
 create mode 100644 vlfm/reality/robots/frame_ids.py

diff --git a/config/experiments/reality.yaml b/config/experiments/reality.yaml
index 632851f..4c213e0 100644
--- a/config/experiments/reality.yaml
+++ b/config/experiments/reality.yaml
@@ -10,7 +10,7 @@ policy:
   name: "RealityITMPolicyV2"
   pointnav_policy_path: "data/pointnav_weights.pth"
   depth_image_shape: [212, 240] # height, width
-  pointnav_stop_radius: 1.2
+  pointnav_stop_radius: 0.9
   use_max_confidence: False
   object_map_erosion_size: 5
   exploration_thresh: 0.0
@@ -21,7 +21,7 @@ policy:
   agent_radius: 0.2

 env:
-  max_body_cam_depth: 3.5
+  max_body_cam_depth: 2.5
   max_gripper_cam_depth: 5.0
   max_lin_dist: 0.2
   max_ang_dist: 0.523599

diff --git a/vlfm/policy/utils/pointnav_policy.py b/vlfm/policy/utils/pointnav_policy.py
index 002de45..9a23177 100644
--- a/vlfm/policy/utils/pointnav_policy.py
+++ b/vlfm/policy/utils/pointnav_policy.py
@@ -190,7 +190,14 @@ def load_pointnav_policy(file_path: str) -> PointNavResNetTensorOutputPolicy:
     else:
         ckpt_dict = torch.load(file_path, map_location="cpu")
     pointnav_policy = PointNavResNetTensorOutputPolicy()
-    current_state_dict = pointnav_policy.state_dict()
+    current_state_dict = pointnav_policy.state_dict()
+    # Let old checkpoints work with new code
+    if 'net.prev_action_embedding_cont.bias' not in ckpt_dict.keys():
+        ckpt_dict['net.prev_action_embedding_cont.bias'] = ckpt_dict['net.prev_action_embedding.bias']
+    if 'net.prev_action_embedding_cont.weight' not in ckpt_dict.keys():
+        ckpt_dict['net.prev_action_embedding_cont.weight'] = ckpt_dict['net.prev_action_embedding.weight']
+
+
     pointnav_policy.load_state_dict(
         {k: v for k, v in ckpt_dict.items() if k in current_state_dict}
     )
diff --git a/vlfm/reality/robots/frame_ids.py b/vlfm/reality/robots/frame_ids.py
new file mode 100644
index 0000000..7da70ec
--- /dev/null
+++ b/vlfm/reality/robots/frame_ids.py
@@ -0,0 +1,8 @@
+class SpotFrameIds:
+    BODY: str = "body"
+    FLAT_BODY: str = "flat_body"
+    GPE: str = "gpe"
+    HAND: str = "hand"
+    LINK_WR1: str = "link_wr1"
+    ODOM: str = "odom"
+    VISION: str = "vision"

From 8365ad0d0384a5e2ad75dfb5bd2b0e45f92e34fa Mon Sep 17 00:00:00 2001
From: Katrina Ashton
Date: Fri, 15 Dec 2023 16:22:43 -0500
Subject: [PATCH 4/5] Working in real world

---
 vlfm/policy/reality_policies.py | 38 +++++++++++++++------------------
 vlfm/vlm/detections.py          |  9 ++++----
 vlfm/vlm/server_wrapper.py      |  8 ++++---
 3 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/vlfm/policy/reality_policies.py b/vlfm/policy/reality_policies.py
index 6711dff..00f028d 100644
--- a/vlfm/policy/reality_policies.py
+++ b/vlfm/policy/reality_policies.py
@@ -75,36 +75,32 @@ def act(
     # convert this numpy array to a dictionary with keys "angular" and "linear" so
     # that it can be passed to the Spot robot.
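The `load_pointnav_policy` hunk in PATCH 3/5 above aliases renamed state-dict keys so checkpoints saved before the `prev_action_embedding` split still load; the second condition originally tested a nonexistent `...cont.weights` key and is corrected to `...cont.weight` above. The same migration as a loop — a sketch with hypothetical tensor shapes; guarding on the old key as well avoids a KeyError when loading a new-style checkpoint:

    import torch

    ckpt_dict = {  # hypothetical old-style checkpoint entries
        "net.prev_action_embedding.weight": torch.zeros(32, 2),
        "net.prev_action_embedding.bias": torch.zeros(32),
    }
    for suffix in ("weight", "bias"):
        new_key = f"net.prev_action_embedding_cont.{suffix}"
        old_key = f"net.prev_action_embedding.{suffix}"
        if new_key not in ckpt_dict and old_key in ckpt_dict:
            ckpt_dict[new_key] = ckpt_dict[old_key]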
if self._done_initializing: - angular = action[0][0].item() - linear = action[0][1].item() - arm_yaw = -1.0 + action_dict = { + "angular": action[0][0].item(), + "linear": action[0][1].item(), + "arm_yaw": -1, + "info": self._policy_info, + } else: - angular = 0.0 - linear = 0.0 - arm_yaw = action[0][0].item() + action_dict = { + "angular": 0, + "linear": 0, + "arm_yaw": action[0][0].item(), + "info": self._policy_info, + } + + if "rho_theta" in self._policy_info: + action_dict["rho_theta"] = self._policy_info["rho_theta"] self._done_initializing = len(self._initial_yaws) == 0 - action = torch.tensor([[angular, linear, arm_yaw]], dtype=torch.float32) + return action_dict - return action, rnn_hidden_states def get_action( self, observations: Dict[str, Any], masks: Tensor, deterministic: bool = True ) -> Dict[str, Any]: - actions, _ = self.act( - observations, None, None, masks, deterministic=deterministic - ) - action_dict = { - "angular": actions[0], - "linear": actions[1], - "arm_yaw": actions[2], - "info": self._policy_info, - } - if "rho_theta" in self._policy_info: - action_dict["rho_theta"] = self._policy_info["rho_theta"] - - return action_dict + return self.act(observations, None, None, masks, deterministic=deterministic) def _reset(self: Union["RealityMixin", ITMPolicyV2]) -> None: parent_cls: ITMPolicyV2 = super() # type: ignore diff --git a/vlfm/vlm/detections.py b/vlfm/vlm/detections.py index 3ec1917..9f87b1b 100644 --- a/vlfm/vlm/detections.py +++ b/vlfm/vlm/detections.py @@ -215,16 +215,15 @@ def draw_bounding_box( if color is None: # Randomly choose a color from the rainbow colormap (so boxes aren't black) - single_pixel = np.array([[np.random.randint(0, 256)]]) + single_pixel = np.array([[np.random.randint(0, 256)]], dtype=np.uint8) single_pixel = cv2.applyColorMap(single_pixel, cv2.COLORMAP_RAINBOW) # reshape to a single dimensional array - rand_color = single_pixel.reshape(3) - color = [int(c) for c in rand_color] # type: ignore + color = single_pixel.reshape(3) else: # Convert RGB to BGR - bgr_color = color[::-1] - color = [int(c) for c in bgr_color] # type: ignore + color = color[::-1] + color = [int(c) for c in color] # Draw bounding box on image box_thickness = 2 diff --git a/vlfm/vlm/server_wrapper.py b/vlfm/vlm/server_wrapper.py index 344057a..32d2d00 100644 --- a/vlfm/vlm/server_wrapper.py +++ b/vlfm/vlm/server_wrapper.py @@ -37,7 +37,8 @@ def process_request() -> Dict[str, Any]: def bool_arr_to_str(arr: np.ndarray) -> str: """Converts a boolean array to a string.""" - packed_str = base64.b64encode(arr.tobytes()).decode() + packed = np.packbits(arr) + packed_str = base64.b64encode(packed).decode() return packed_str @@ -49,8 +50,9 @@ def str_to_bool_arr(s: str, shape: tuple) -> np.ndarray: # Convert bytes to np.uint8 array bytes_array = np.frombuffer(bytes_, dtype=np.uint8) - # Reshape the data back into a boolean array - unpacked = bytes_array.reshape(shape) + # Unpack the bytes back into a boolean array + unpacked = np.unpackbits(bytes_array) + unpacked = unpacked.reshape(shape) return unpacked From 6bd973156436cda25408423d738ff364a12c529d Mon Sep 17 00:00:00 2001 From: Bernadette Bucher Date: Tue, 19 Dec 2023 12:12:55 -0500 Subject: [PATCH 5/5] updated precommit check, fixed some bugs and cleaned up code --- pyproject.toml | 26 ++++--- scripts/parse_jsons.py | 26 ++----- test/test_setup.py | 2 +- test/test_visualization.py | 2 +- vlfm/mapping/base_map.py | 17 ++--- vlfm/mapping/frontier_map.py | 13 +--- vlfm/mapping/object_point_cloud_map.py | 41 +++-------- 
vlfm/mapping/obstacle_map.py | 20 ++---- vlfm/mapping/traj_visualizer.py | 30 ++------ vlfm/mapping/value_map.py | 67 ++++++------------ vlfm/measurements/traveled_stairs.py | 4 +- vlfm/obs_transformers/resize.py | 25 ++----- vlfm/obs_transformers/utils.py | 4 +- vlfm/policy/action_replay_policy.py | 4 +- vlfm/policy/base_objectnav_policy.py | 68 +++++-------------- vlfm/policy/base_policy.py | 4 +- vlfm/policy/habitat_policies.py | 20 ++---- vlfm/policy/itm_policy.py | 28 ++------ vlfm/policy/reality_policies.py | 42 ++++-------- vlfm/policy/utils/acyclic_enforcer.py | 8 +-- .../non_habitat_policy/nh_pointnav_policy.py | 20 ++---- .../policy/utils/non_habitat_policy/resnet.py | 20 ++---- .../non_habitat_policy/rnn_state_encoder.py | 20 ++---- vlfm/policy/utils/pointnav_policy.py | 37 +++------- vlfm/reality/bdsw_nav_env.py | 4 +- vlfm/reality/objectnav_env.py | 16 ++--- vlfm/reality/pointnav_env.py | 16 ++--- vlfm/reality/robots/base_robot.py | 4 +- vlfm/reality/robots/bdsw_robot.py | 11 +-- vlfm/reality/run_bdsw_objnav_env.py | 8 +-- vlfm/semexp_env/eval.py | 18 ++--- vlfm/utils/episode_stats_logger.py | 4 +- vlfm/utils/geometry_utils.py | 33 +++------ vlfm/utils/habitat_visualizer.py | 16 ++--- vlfm/utils/img_utils.py | 47 ++++--------- vlfm/utils/log_saver.py | 4 +- vlfm/utils/visualization.py | 4 +- vlfm/utils/vlfm_trainer.py | 45 +++--------- vlfm/vlm/blip2.py | 8 +-- vlfm/vlm/blip2itm.py | 16 ++--- vlfm/vlm/detections.py | 8 +-- vlfm/vlm/fiber.py | 16 ++--- vlfm/vlm/grounding_dino.py | 20 ++---- vlfm/vlm/sam.py | 8 +-- vlfm/vlm/yolov7.py | 16 ++--- 45 files changed, 233 insertions(+), 637 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1074931..3acc9bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,14 +12,14 @@ authors = [ {name = "Naoki Yokoyama", email = "nyokoyama@theaiinstitute.com"}, ] readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "torch >= 1.10.1", "numpy >= 1.22.4", "flask >= 2.3.2", "seaborn >= 0.12.2", # required by yolov7 "open3d >= 0.17.0", - "transformers == 4.26.0", # higher versions break BLIP-2 + "transformers >= 4.28.1", # higher versions than 4.26.0 "break" BLIP-2 but need 4.28.1 for integration "salesforce-lavis >= 1.0.2", # for BLIP-2 "frontier_exploration @ git+https://github.com/naokiyokoyama/frontier_exploration.git", "mobile_sam @ git+https://github.com/ChaoningZhang/MobileSAM.git", @@ -49,8 +49,8 @@ reality = [ "Homepage" = "theaiinstitute.com" "GitHub" = "https://github.com/bdaiinstitute/vlfm" -[tool.setuptools] -packages = ["vlfm", "config"] +[tool.setuptools.packages.find] +where = ["vlfm"] [tool.ruff] # Enable pycodestyle (`E`), Pyflakes (`F`), and import sorting (`I`) @@ -92,8 +92,8 @@ line-length = 120 # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -# Assume Python 3.9. -target-version = "py39" +# Assume Python 3.10 +target-version = "py310" [tool.ruff.per-file-ignores] "__init__.py" = ["F401"] @@ -103,8 +103,8 @@ target-version = "py39" max-complexity = 10 [tool.black] -line-length = 88 -target-version = ['py39'] +line-length = 120 +target-version = ['py310'] include = '\.pyi?$' # `extend-exclude` is not honored when `black` is passed a file path explicitly, # as is typical when `black` is invoked via `pre-commit`. 
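The `[tool.mypy]` settings in the next hunk widen `files` to `['vlfm','test','scripts']` while keeping `disallow_untyped_defs = true`, which is what forces the `-> None` annotations added to the test functions later in this patch. A minimal illustration of what that flag demands:

    # Accepted: fully annotated, even though tests return nothing
    def test_load_and_save_config() -> None:
        ...

    # Rejected under disallow_untyped_defs (missing return type annotation):
    # def test_load_and_save_config():
    #     ...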
@@ -116,9 +116,13 @@ force-exclude = ''' preview = true +[tool.coverage.run] +relative_files = true + + # mypy configuration [tool.mypy] -python_version = "3.9" +python_version = "3.10" disallow_untyped_defs = true ignore_missing_imports = true explicit_package_bases = true @@ -127,5 +131,5 @@ strict_equality = true warn_unreachable = true warn_redundant_casts = true no_implicit_optional = true -files = ['vlfm'] -exclude = '^(docker|.*external|.*thirdparty|.*install|.*build|.*_experimental)/' +files = ['vlfm','test','scripts'] +exclude = "^(docker|.*external|.*thirdparty|.*install|.*build|.*_experimental|.*_pb2.py|.*_pb2_grpc.py)" diff --git a/scripts/parse_jsons.py b/scripts/parse_jsons.py index 949d2d9..8d31c71 100644 --- a/scripts/parse_jsons.py +++ b/scripts/parse_jsons.py @@ -46,9 +46,7 @@ def calculate_frequencies(failure_causes: List[str]) -> None: for cause, count in counter.most_common(): percentage = (count / total) * 100 # Add each row to the table - table.add_row( - [cause.replace("did_not_fail", "succeeded!"), count, f"{percentage:.2f}%"] - ) + table.add_row([cause.replace("did_not_fail", "succeeded!"), count, f"{percentage:.2f}%"]) print(table) @@ -60,10 +58,7 @@ def calculate_avg_performance(stats: List[Dict[str, Any]]) -> None: Args: stats (List[Dict[str, Any]]): A list of stats for each episode. """ - success, spl, soft_spl = [ - [episode.get(k, -1) for episode in stats] - for k in ["success", "spl", "soft_spl"] - ] + success, spl, soft_spl = [[episode.get(k, -1) for episode in stats] for k in ["success", "spl", "soft_spl"]] # Create a table with headers table = PrettyTable(["Metric", "Average"]) @@ -101,28 +96,23 @@ def calculate_avg_fail_per_category(stats: List[Dict[str, Any]]) -> None: table = PrettyTable(["Category", "Average Failure Rate"]) # Add each row to the table - for category, stats in sorted( + for category, c_stats in sorted( category_stats.items(), key=lambda x: x[1]["fail_count"], reverse=True, ): - avg_failure_rate = (stats["fail_count"] / stats["total_count"]) * 100 + avg_failure_rate = (c_stats["fail_count"] / c_stats["total_count"]) * 100 table.add_row( [ category, - ( - f"{avg_failure_rate:.2f}% ({stats['fail_count']}/" - f"{stats['total_count']})" - ), + f"{avg_failure_rate:.2f}% ({c_stats['fail_count']}/{c_stats['total_count']})", ] ) print(table) -def calculate_avg_fail_rate_per_category( - stats: List[Dict[str, Any]], failure_cause: str -) -> None: +def calculate_avg_fail_rate_per_category(stats: List[Dict[str, Any]], failure_cause: str) -> None: """ For each possible "target_object", count the number of times the agent failed due to the given failure cause. 
Then, sum the counts across all categories and use it to @@ -147,9 +137,7 @@ def calculate_avg_fail_rate_per_category( table = PrettyTable(["Category", f"% Occurrence for {failure_cause}"]) # Sort the categories by their failure count in descending order - sorted_categories = sorted( - category_to_fail_count.items(), key=lambda x: x[1], reverse=True - ) + sorted_categories = sorted(category_to_fail_count.items(), key=lambda x: x[1], reverse=True) # Add each row to the table for category, count in sorted_categories: diff --git a/test/test_setup.py b/test/test_setup.py index 3847d66..c4126d2 100644 --- a/test/test_setup.py +++ b/test/test_setup.py @@ -7,7 +7,7 @@ from vlfm.utils.generate_dummy_policy import save_dummy_policy -def test_load_and_save_config(): +def test_load_and_save_config() -> None: if not os.path.exists("data"): os.makedirs("data") diff --git a/test/test_visualization.py b/test/test_visualization.py index cb0c969..e984058 100644 --- a/test/test_visualization.py +++ b/test/test_visualization.py @@ -7,7 +7,7 @@ from vlfm.utils.visualization import generate_text_image -def test_visualization(): +def test_visualization() -> None: if not os.path.exists("build"): os.makedirs("build") diff --git a/vlfm/mapping/base_map.py b/vlfm/mapping/base_map.py index 8247481..37ffe90 100644 --- a/vlfm/mapping/base_map.py +++ b/vlfm/mapping/base_map.py @@ -12,9 +12,7 @@ class BaseMap: _last_camera_yaw: float = 0.0 _map_dtype: np.dtype = np.dtype(np.float32) - def __init__( - self, size: int = 1000, pixels_per_meter: int = 20, *args: Any, **kwargs: Any - ): + def __init__(self, size: int = 1000, pixels_per_meter: int = 20, *args: Any, **kwargs: Any): """ Args: size: The size of the map in pixels. @@ -23,16 +21,12 @@ def __init__( self.size = size self._map = np.zeros((size, size), dtype=self._map_dtype) self._episode_pixel_origin = np.array([size // 2, size // 2]) - self._traj_vis = TrajectoryVisualizer( - self._episode_pixel_origin, self.pixels_per_meter - ) + self._traj_vis = TrajectoryVisualizer(self._episode_pixel_origin, self.pixels_per_meter) def reset(self) -> None: self._map.fill(0) self._camera_positions = [] - self._traj_vis = TrajectoryVisualizer( - self._episode_pixel_origin, self.pixels_per_meter - ) + self._traj_vis = TrajectoryVisualizer(self._episode_pixel_origin, self.pixels_per_meter) def update_agent_traj(self, robot_xy: np.ndarray, robot_heading: float) -> None: self._camera_positions.append(robot_xy) @@ -47,10 +41,7 @@ def _xy_to_px(self, points: np.ndarray) -> np.ndarray: Returns: The array of (x, y) pixel coordinates. """ - px = ( - np.rint(points[:, ::-1] * self.pixels_per_meter) - + self._episode_pixel_origin - ) + px = np.rint(points[:, ::-1] * self.pixels_per_meter) + self._episode_pixel_origin px[:, 0] = self._map.shape[0] - px[:, 0] return px.astype(int) diff --git a/vlfm/mapping/frontier_map.py b/vlfm/mapping/frontier_map.py index 57ebe05..ec061ec 100644 --- a/vlfm/mapping/frontier_map.py +++ b/vlfm/mapping/frontier_map.py @@ -22,9 +22,7 @@ def __init__(self, encoding_type: str = "cosine"): def reset(self) -> None: self.frontiers = [] - def update( - self, frontier_locations: List[np.ndarray], curr_image: np.ndarray, text: str - ) -> None: + def update(self, frontier_locations: List[np.ndarray], curr_image: np.ndarray, text: str) -> None: """ Takes in a list of frontier coordinates and the current image observation from the robot. 
Any stored frontiers that are not present in the given list are @@ -41,19 +39,14 @@ def update( self.frontiers = [ frontier for frontier in self.frontiers - if any( - np.array_equal(frontier.xyz, location) - for location in frontier_locations - ) + if any(np.array_equal(frontier.xyz, location) for location in frontier_locations) ] # Add any frontiers that are not already stored. Set their image field to the # given image. cosine = None for location in frontier_locations: - if not any( - np.array_equal(frontier.xyz, location) for frontier in self.frontiers - ): + if not any(np.array_equal(frontier.xyz, location) for frontier in self.frontiers): if cosine is None: cosine = self._encode(curr_image, text) self.frontiers.append(Frontier(location, cosine)) diff --git a/vlfm/mapping/object_point_cloud_map.py b/vlfm/mapping/object_point_cloud_map.py index eb0acaa..994f614 100644 --- a/vlfm/mapping/object_point_cloud_map.py +++ b/vlfm/mapping/object_point_cloud_map.py @@ -41,9 +41,7 @@ def update_map( fy: float, ) -> None: """Updates the object map with the latest information from the agent.""" - local_cloud = self._extract_object_cloud( - depth_img, object_mask, min_depth, max_depth, fx, fy - ) + local_cloud = self._extract_object_cloud(depth_img, object_mask, min_depth, max_depth, fx, fy) if len(local_cloud) == 0: return @@ -72,15 +70,11 @@ def update_map( return if object_name in self.clouds: - self.clouds[object_name] = np.concatenate( - (self.clouds[object_name], global_cloud), axis=0 - ) + self.clouds[object_name] = np.concatenate((self.clouds[object_name], global_cloud), axis=0) else: self.clouds[object_name] = global_cloud - def get_best_object( - self, target_class: str, curr_position: np.ndarray - ) -> np.ndarray: + def get_best_object(self, target_class: str, curr_position: np.ndarray) -> np.ndarray: target_cloud = self.get_target_cloud(target_class) closest_point_2d = self._get_closest_point(target_cloud, curr_position)[:2] @@ -96,10 +90,7 @@ def get_best_object( if delta_dist < 0.1: # closest point is only slightly different return self.last_target_coord - elif ( - delta_dist < 0.5 - and np.linalg.norm(curr_position - closest_point_2d) > 2.0 - ): + elif delta_dist < 0.5 and np.linalg.norm(curr_position - closest_point_2d) > 2.0: # closest point is a little different, but the agent is too far for # the difference to matter much return self.last_target_coord @@ -108,9 +99,7 @@ def get_best_object( return self.last_target_coord - def update_explored( - self, tf_camera_to_episodic: np.ndarray, max_depth: float, cone_fov: float - ) -> None: + def update_explored(self, tf_camera_to_episodic: np.ndarray, max_depth: float, cone_fov: float) -> None: """ This method will remove all point clouds in self.clouds that were originally detected to be out-of-range, but are now within range. 
This is just a heuristic @@ -140,9 +129,7 @@ def update_explored( # Detection was originally within range continue # Remove all points from self.clouds[obj] that have the same range_id - self.clouds[obj] = self.clouds[obj][ - self.clouds[obj][..., -1] != range_id - ] + self.clouds[obj] = self.clouds[obj][self.clouds[obj][..., -1] != range_id] def get_target_cloud(self, target_class: str) -> np.ndarray: target_cloud = self.clouds[target_class].copy() @@ -163,9 +150,7 @@ def _extract_object_cloud( fy: float, ) -> np.ndarray: final_mask = object_mask * 255 - final_mask = cv2.erode( # type: ignore - final_mask, None, iterations=self._erosion_size - ) + final_mask = cv2.erode(final_mask, None, iterations=self._erosion_size) # type: ignore valid_depth = depth.copy() valid_depth[valid_depth == 0] = 1 # set all holes (0) to just be far (1) @@ -177,15 +162,11 @@ def _extract_object_cloud( return cloud - def _get_closest_point( - self, cloud: np.ndarray, curr_position: np.ndarray - ) -> np.ndarray: + def _get_closest_point(self, cloud: np.ndarray, curr_position: np.ndarray) -> np.ndarray: ndim = curr_position.shape[0] if self.use_dbscan: # Return the point that is closest to curr_position, which is 2D - closest_point = cloud[ - np.argmin(np.linalg.norm(cloud[:, :ndim] - curr_position, axis=1)) - ] + closest_point = cloud[np.argmin(np.linalg.norm(cloud[:, :ndim] - curr_position, axis=1))] else: # Calculate the Euclidean distance from each point to the reference point if ndim == 2: @@ -208,9 +189,7 @@ def _get_closest_point( return closest_point -def open3d_dbscan_filtering( - points: np.ndarray, eps: float = 0.2, min_points: int = 100 -) -> np.ndarray: +def open3d_dbscan_filtering(points: np.ndarray, eps: float = 0.2, min_points: int = 100) -> np.ndarray: pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(points) diff --git a/vlfm/mapping/obstacle_map.py b/vlfm/mapping/obstacle_map.py index c840347..705a4ff 100644 --- a/vlfm/mapping/obstacle_map.py +++ b/vlfm/mapping/obstacle_map.py @@ -92,12 +92,8 @@ def update_map( scaled_depth = filled_depth * (max_depth - min_depth) + min_depth mask = scaled_depth < max_depth point_cloud_camera_frame = get_point_cloud(scaled_depth, mask, fx, fy) - point_cloud_episodic_frame = transform_points( - tf_camera_to_episodic, point_cloud_camera_frame - ) - obstacle_cloud = filter_points_by_height( - point_cloud_episodic_frame, self._min_height, self._max_height - ) + point_cloud_episodic_frame = transform_points(tf_camera_to_episodic, point_cloud_camera_frame) + obstacle_cloud = filter_points_by_height(point_cloud_episodic_frame, self._min_height, self._max_height) # Populate topdown map with obstacle locations xy_points = obstacle_cloud[:, :2] @@ -126,9 +122,7 @@ def update_map( fov=np.rad2deg(topdown_fov), max_line_len=max_depth * self.pixels_per_meter, ) - new_explored_area = cv2.dilate( - new_explored_area, np.ones((3, 3), np.uint8), iterations=1 - ) + new_explored_area = cv2.dilate(new_explored_area, np.ones((3, 3), np.uint8), iterations=1) self.explored_area[new_explored_area > 0] = 1 self.explored_area[self._navigable_map == 0] = 0 contours, _ = cv2.findContours( @@ -140,9 +134,7 @@ def update_map( min_dist = np.inf best_idx = 0 for idx, cnt in enumerate(contours): - dist = cv2.pointPolygonTest( - cnt, tuple([int(i) for i in agent_pixel_location]), True - ) + dist = cv2.pointPolygonTest(cnt, tuple([int(i) for i in agent_pixel_location]), True) if dist >= 0: best_idx = idx break @@ -201,7 +193,5 @@ def visualize(self) -> np.ndarray: return 
vis_img -def filter_points_by_height( - points: np.ndarray, min_height: float, max_height: float -) -> np.ndarray: +def filter_points_by_height(points: np.ndarray, min_height: float, max_height: float) -> np.ndarray: return points[(points[:, 2] >= min_height) & (points[:, 2] <= max_height)] diff --git a/vlfm/mapping/traj_visualizer.py b/vlfm/mapping/traj_visualizer.py index 661c0cb..b7e2df7 100644 --- a/vlfm/mapping/traj_visualizer.py +++ b/vlfm/mapping/traj_visualizer.py @@ -36,9 +36,7 @@ def draw_trajectory( img = self._draw_agent(img, camera_positions[-1], camera_yaw) return img - def _draw_path( - self, img: np.ndarray, camera_positions: Union[np.ndarray, List[np.ndarray]] - ) -> np.ndarray: + def _draw_path(self, img: np.ndarray, camera_positions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: """Draws the path on the image and returns it""" if len(camera_positions) < 2: return img @@ -48,9 +46,7 @@ def _draw_path( path_mask = np.zeros(img.shape[:2], dtype=np.uint8) for i in range(self._num_drawn_points - 1, len(camera_positions) - 1): - path_mask = self._draw_line( - path_mask, camera_positions[i], camera_positions[i + 1] - ) + path_mask = self._draw_line(path_mask, camera_positions[i], camera_positions[i + 1]) img[path_mask == 255] = self.path_color @@ -59,9 +55,7 @@ def _draw_path( return img - def _draw_line( - self, img: np.ndarray, pt_a: np.ndarray, pt_b: np.ndarray - ) -> np.ndarray: + def _draw_line(self, img: np.ndarray, pt_a: np.ndarray, pt_b: np.ndarray) -> np.ndarray: """Draws a line between two points and returns it""" # Convert metric coordinates to pixel coordinates px_a = self._metric_to_pixel(pt_a) @@ -80,9 +74,7 @@ def _draw_line( return img - def _draw_agent( - self, img: np.ndarray, camera_position: np.ndarray, camera_yaw: float - ) -> np.ndarray: + def _draw_agent(self, img: np.ndarray, camera_position: np.ndarray, camera_yaw: float) -> np.ndarray: """Draws the agent on the image and returns it""" px_position = self._metric_to_pixel(camera_position) cv2.circle( @@ -93,14 +85,8 @@ def _draw_agent( -1, ) heading_end_pt = ( - int( - px_position[0] - - self.agent_line_length * self.scale_factor * np.cos(camera_yaw) - ), - int( - px_position[1] - - self.agent_line_length * self.scale_factor * np.sin(camera_yaw) - ), + int(px_position[0] - self.agent_line_length * self.scale_factor * np.cos(camera_yaw)), + int(px_position[1] - self.agent_line_length * self.scale_factor * np.sin(camera_yaw)), ) cv2.line( img, @@ -112,9 +98,7 @@ def _draw_agent( return img - def draw_circle( - self, img: np.ndarray, position: np.ndarray, **kwargs: Any - ) -> np.ndarray: + def draw_circle(self, img: np.ndarray, position: np.ndarray, **kwargs: Any) -> np.ndarray: """Draws the point as a circle on the image and returns it""" px_position = self._metric_to_pixel(position) cv2.circle(img, tuple(px_position[::-1]), **kwargs) diff --git a/vlfm/mapping/value_map.py b/vlfm/mapping/value_map.py index cfc1e90..01c5d3d 100644 --- a/vlfm/mapping/value_map.py +++ b/vlfm/mapping/value_map.py @@ -39,6 +39,7 @@ class ValueMap(BaseMap): _last_camera_yaw: float = 0.0 _min_confidence: float = 0.25 _decision_threshold: float = 0.35 + _map: np.ndarray def __init__( self, @@ -75,9 +76,7 @@ def __init__( if RECORDING: if osp.isdir(RECORDING_DIR): - warnings.warn( - f"Recording directory {RECORDING_DIR} already exists. Deleting it." - ) + warnings.warn(f"Recording directory {RECORDING_DIR} already exists. 
Deleting it.") shutil.rmtree(RECORDING_DIR) os.mkdir(RECORDING_DIR) # Dump all args to a file @@ -119,14 +118,11 @@ def update_map( max_depth: The maximum depth value in meters. fov: The field of view of the camera in RADIANS. """ - assert len(values) == self._value_channels, ( - "Incorrect number of values given " - f"({len(values)}). Expected {self._value_channels}." - ) + assert ( + len(values) == self._value_channels + ), f"Incorrect number of values given ({len(values)}). Expected {self._value_channels}." - curr_map = self._localize_new_data( - depth, tf_camera_to_episodic, min_depth, max_depth, fov - ) + curr_map = self._localize_new_data(depth, tf_camera_to_episodic, min_depth, max_depth, fov) # Fuse the new data with the existing data self._fuse_new_data(curr_map, values) @@ -180,9 +176,7 @@ def get_value(point: np.ndarray) -> Union[float, Tuple[float, ...]]: values = [get_value(point) for point in waypoints] if self._value_channels > 1: - assert ( - reduce_fn is not None - ), "Must provide a reduction function when using multiple value channels." + assert reduce_fn is not None, "Must provide a reduction function when using multiple value channels." values = reduce_fn(values) # Use np.argsort to get the indices of the sorted values @@ -224,9 +218,7 @@ def visualize( return map_img - def _process_local_data( - self, depth: np.ndarray, fov: float, min_depth: float, max_depth: float - ) -> np.ndarray: + def _process_local_data(self, depth: np.ndarray, fov: float, min_depth: float, max_depth: float) -> np.ndarray: """Using the FOV and depth, return the visible portion of the FOV. Args: @@ -277,16 +269,10 @@ def _process_local_data( if not os.path.exists("visualizations"): os.makedirs("visualizations") # Expand the depth_row back into a full image - depth_row_full = np.repeat( - depth_row.reshape(1, -1), depth.shape[0], axis=0 - ) + depth_row_full = np.repeat(depth_row.reshape(1, -1), depth.shape[0], axis=0) # Stack the depth images with the visible mask - depth_rgb = cv2.cvtColor( - (depth * 255).astype(np.uint8), cv2.COLOR_GRAY2RGB - ) - depth_row_full = cv2.cvtColor( - (depth_row_full * 255).astype(np.uint8), cv2.COLOR_GRAY2RGB - ) + depth_rgb = cv2.cvtColor((depth * 255).astype(np.uint8), cv2.COLOR_GRAY2RGB) + depth_row_full = cv2.cvtColor((depth_row_full * 255).astype(np.uint8), cv2.COLOR_GRAY2RGB) vis = np.flipud(vis) new_width = int(vis.shape[1] * (depth_rgb.shape[0] / vis.shape[0])) vis_resized = cv2.resize(vis, (new_width, depth_rgb.shape[0])) @@ -376,10 +362,9 @@ def _fuse_new_data(self, new_map: np.ndarray, values: np.ndarray) -> None: 0 and 1, with 1 being the most confident. values: The values attributed to the new portion of the map. """ - assert len(values) == self._value_channels, ( - "Incorrect number of values given " - f"({len(values)}). Expected {self._value_channels}." - ) + assert ( + len(values) == self._value_channels + ), f"Incorrect number of values given ({len(values)}). Expected {self._value_channels}." 
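`_fuse_new_data` (continuing below) blends the existing value map with new readings via per-pixel confidence weights, applying the same weights to the value channels so values and confidences stay consistent. A toy two-pixel sketch of that weighting:

    import numpy as np

    old_conf = np.array([0.9, 0.2])
    new_conf = np.array([0.1, 0.8])
    old_val = np.array([1.0, 1.0])
    new_val = np.array([0.0, 0.0])

    denom = old_conf + new_conf  # confidence_denominator
    w_old, w_new = old_conf / denom, new_conf / denom
    fused_val = old_val * w_old + new_val * w_new  # -> [0.9, 0.2]
    fused_conf = old_conf * w_old + new_conf * w_new
    # np.nan_to_num then zeroes out pixels where denom == 0, as the code below does.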
if self._obstacle_map is not None: # If an obstacle map is provided, we will use it to mask out the @@ -405,16 +390,12 @@ def _fuse_new_data(self, new_map: np.ndarray, values: np.ndarray) -> None: self._map[self._map > 0] = 1 new_map[new_map > 0] = 1 else: - assert ( - self._fusion_type == "default" - ), f"Unknown fusion type {self._fusion_type}" + assert self._fusion_type == "default", f"Unknown fusion type {self._fusion_type}" # Any values in the given map that are less confident than # self._decision_threshold AND less than the new_map in the existing map # will be silenced into 0s - new_map_mask = np.logical_and( - new_map < self._decision_threshold, new_map < self._map - ) + new_map_mask = np.logical_and(new_map < self._decision_threshold, new_map < self._map) new_map[new_map_mask] = 0 if self._use_max_confidence: @@ -436,16 +417,10 @@ def _fuse_new_data(self, new_map: np.ndarray, values: np.ndarray) -> None: weight_1 = self._map / confidence_denominator weight_2 = new_map / confidence_denominator - weight_1_channeled = np.repeat( - np.expand_dims(weight_1, axis=2), self._value_channels, axis=2 - ) - weight_2_channeled = np.repeat( - np.expand_dims(weight_2, axis=2), self._value_channels, axis=2 - ) + weight_1_channeled = np.repeat(np.expand_dims(weight_1, axis=2), self._value_channels, axis=2) + weight_2_channeled = np.repeat(np.expand_dims(weight_2, axis=2), self._value_channels, axis=2) - self._value_map = ( - self._value_map * weight_1_channeled + values * weight_2_channeled - ) + self._value_map = self._value_map * weight_1_channeled + values * weight_2_channeled self._map = self._map * weight_1 + new_map * weight_2 # Because confidence_denominator can have 0 values, any nans in either the @@ -454,9 +429,7 @@ def _fuse_new_data(self, new_map: np.ndarray, values: np.ndarray) -> None: self._map = np.nan_to_num(self._map) -def remap( - value: float, from_low: float, from_high: float, to_low: float, to_high: float -) -> float: +def remap(value: float, from_low: float, from_high: float, to_low: float, to_high: float) -> float: """Maps a value from one range to another. Args: diff --git a/vlfm/measurements/traveled_stairs.py b/vlfm/measurements/traveled_stairs.py index fb8fd5f..d1cef14 100644 --- a/vlfm/measurements/traveled_stairs.py +++ b/vlfm/measurements/traveled_stairs.py @@ -18,9 +18,7 @@ class TraveledStairs(Measure): cls_uuid: str = "traveled_stairs" - def __init__( - self, sim: Simulator, config: DictConfig, *args: Any, **kwargs: Any - ) -> None: + def __init__(self, sim: Simulator, config: DictConfig, *args: Any, **kwargs: Any) -> None: self._sim = sim self._config = config self._history: List[np.ndarray] = [] diff --git a/vlfm/obs_transformers/resize.py b/vlfm/obs_transformers/resize.py index 0493c2a..eb2ea5d 100644 --- a/vlfm/obs_transformers/resize.py +++ b/vlfm/obs_transformers/resize.py @@ -41,30 +41,19 @@ def __init__( self.trans_keys: Tuple[str, ...] 
= trans_keys self.semantic_key = semantic_key - def transform_observation_space( - self, observation_space: spaces.Dict - ) -> spaces.Dict: + def transform_observation_space(self, observation_space: spaces.Dict) -> spaces.Dict: observation_space = copy.deepcopy(observation_space) for key in observation_space.spaces: if key in self.trans_keys: # In the observation space dict, the channels are always last - h, w = get_image_height_width( - observation_space.spaces[key], channels_last=True - ) + h, w = get_image_height_width(observation_space.spaces[key], channels_last=True) if self._size == (h, w): continue - logger.info( - "Resizing observation of %s: from %s to %s" - % (key, (h, w), self._size) - ) - observation_space.spaces[key] = overwrite_gym_box_shape( - observation_space.spaces[key], self._size - ) + logger.info("Resizing observation of %s: from %s to %s" % (key, (h, w), self._size)) + observation_space.spaces[key] = overwrite_gym_box_shape(observation_space.spaces[key], self._size) return observation_space - def _transform_obs( - self, obs: torch.Tensor, interpolation_mode: str - ) -> torch.Tensor: + def _transform_obs(self, obs: torch.Tensor, interpolation_mode: str) -> torch.Tensor: return image_resize( obs, self._size, @@ -79,9 +68,7 @@ def forward(self, observations: Dict[str, torch.Tensor]) -> Dict[str, torch.Tens interpolation_mode = "area" if self.semantic_key in sensor: interpolation_mode = "nearest" - observations[sensor] = self._transform_obs( - observations[sensor], interpolation_mode - ) + observations[sensor] = self._transform_obs(observations[sensor], interpolation_mode) return observations @classmethod diff --git a/vlfm/obs_transformers/utils.py b/vlfm/obs_transformers/utils.py index c304b4c..13f759b 100644 --- a/vlfm/obs_transformers/utils.py +++ b/vlfm/obs_transformers/utils.py @@ -35,9 +35,7 @@ def image_resize( # NDHWC -> NDCHW img = img.permute(0, 1, 4, 2, 3) - img = torch.nn.functional.interpolate( - img.float(), size=size, mode=interpolation_mode - ).to(dtype=img.dtype) + img = torch.nn.functional.interpolate(img.float(), size=size, mode=interpolation_mode).to(dtype=img.dtype) if channels_last: if len(img.shape) == 4: # NCHW -> NHWC diff --git a/vlfm/policy/action_replay_policy.py b/vlfm/policy/action_replay_policy.py index 59092da..dbf1653 100644 --- a/vlfm/policy/action_replay_policy.py +++ b/vlfm/policy/action_replay_policy.py @@ -33,9 +33,7 @@ def __init__( **kwargs: Any, ) -> None: super().__init__() - assert ( - "VLFM_RECORD_ACTIONS_DIR" in os.environ - ), "Must set VLFM_RECORD_ACTIONS_DIR" + assert "VLFM_RECORD_ACTIONS_DIR" in os.environ, "Must set VLFM_RECORD_ACTIONS_DIR" self._dir = os.environ["VLFM_RECORD_ACTIONS_DIR"] filepath = os.path.join(self._dir, "actions.txt") with open(filepath, "r") as f: diff --git a/vlfm/policy/base_objectnav_policy.py b/vlfm/policy/base_objectnav_policy.py index e6953f0..196114b 100644 --- a/vlfm/policy/base_objectnav_policy.py +++ b/vlfm/policy/base_objectnav_policy.py @@ -61,22 +61,14 @@ def __init__( **kwargs: Any, ) -> None: super().__init__() - self._object_detector = GroundingDINOClient( - port=int(os.environ.get("GROUNDING_DINO_PORT", "12181")) - ) - self._coco_object_detector = YOLOv7Client( - port=int(os.environ.get("YOLOV7_PORT", "12184")) - ) - self._mobile_sam = MobileSAMClient( - port=int(os.environ.get("SAM_PORT", "12183")) - ) + self._object_detector = GroundingDINOClient(port=int(os.environ.get("GROUNDING_DINO_PORT", "12181"))) + self._coco_object_detector = 
YOLOv7Client(port=int(os.environ.get("YOLOV7_PORT", "12184"))) + self._mobile_sam = MobileSAMClient(port=int(os.environ.get("SAM_PORT", "12183"))) self._use_vqa = use_vqa if use_vqa: self._vqa = BLIP2Client(port=int(os.environ.get("BLIP2_PORT", "12185"))) self._pointnav_policy = WrappedPointNavResNetPolicy(pointnav_policy_path) - self._object_map: ObjectPointCloudMap = ObjectPointCloudMap( - erosion_size=object_map_erosion_size - ) + self._object_map: ObjectPointCloudMap = ObjectPointCloudMap(erosion_size=object_map_erosion_size) self._depth_image_shape = tuple(depth_image_shape) self._pointnav_stop_radius = pointnav_stop_radius self._visualize = visualize @@ -176,9 +168,7 @@ def _initialize(self) -> Tensor: def _explore(self, observations: "TensorDict") -> Tensor: raise NotImplementedError - def _get_target_object_location( - self, position: np.ndarray - ) -> Union[None, np.ndarray]: + def _get_target_object_location(self, position: np.ndarray) -> Union[None, np.ndarray]: if self._object_map.has_object(self._target_object): return self._object_map.get_best_object(self._target_object, position) else: @@ -207,30 +197,20 @@ def _get_policy_info(self, detections: ObjectDetections) -> Dict[str, Any]: return policy_info annotated_depth = self._observations_cache["object_map_rgbd"][0][1] * 255 - annotated_depth = cv2.cvtColor( - annotated_depth.astype(np.uint8), cv2.COLOR_GRAY2RGB - ) + annotated_depth = cv2.cvtColor(annotated_depth.astype(np.uint8), cv2.COLOR_GRAY2RGB) if self._object_masks.sum() > 0: # If self._object_masks isn't all zero, get the object segmentations and # draw them on the rgb and depth images - contours, _ = cv2.findContours( - self._object_masks, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE - ) - annotated_rgb = cv2.drawContours( - detections.annotated_frame, contours, -1, (255, 0, 0), 2 - ) - annotated_depth = cv2.drawContours( - annotated_depth, contours, -1, (255, 0, 0), 2 - ) + contours, _ = cv2.findContours(self._object_masks, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + annotated_rgb = cv2.drawContours(detections.annotated_frame, contours, -1, (255, 0, 0), 2) + annotated_depth = cv2.drawContours(annotated_depth, contours, -1, (255, 0, 0), 2) else: annotated_rgb = self._observations_cache["object_map_rgbd"][0][0] policy_info["annotated_rgb"] = annotated_rgb policy_info["annotated_depth"] = annotated_depth if self._compute_frontiers: - policy_info["obstacle_map"] = cv2.cvtColor( - self._obstacle_map.visualize(), cv2.COLOR_BGR2RGB - ) + policy_info["obstacle_map"] = cv2.cvtColor(self._obstacle_map.visualize(), cv2.COLOR_BGR2RGB) if "DEBUG_INFO" in os.environ: policy_info["render_below_images"].append("debug") @@ -249,16 +229,12 @@ def _get_object_detections(self, img: np.ndarray) -> ObjectDetections: else self._object_detector.predict(img, caption=self._non_coco_caption) ) detections.filter_by_class(target_classes) - det_conf_threshold = ( - self._coco_threshold if has_coco else self._non_coco_threshold - ) + det_conf_threshold = self._coco_threshold if has_coco else self._non_coco_threshold detections.filter_by_conf(det_conf_threshold) if has_coco and has_non_coco and detections.num_detections == 0: # Retry with non-coco object detector - detections = self._object_detector.predict( - img, caption=self._non_coco_caption - ) + detections = self._object_detector.predict(img, caption=self._non_coco_caption) detections.filter_by_class(target_classes) detections.filter_by_conf(self._non_coco_threshold) @@ -285,9 +261,7 @@ def _pointnav(self, goal: np.ndarray, stop: bool = False) -> 
Tensor: robot_xy = self._observations_cache["robot_xy"] heading = self._observations_cache["robot_heading"] rho, theta = rho_theta(robot_xy, heading, goal) - rho_theta_tensor = torch.tensor( - [[rho, theta]], device="cuda", dtype=torch.float32 - ) + rho_theta_tensor = torch.tensor([[rho, theta]], device="cuda", dtype=torch.float32) obs_pointnav = { "depth": image_resize( self._observations_cache["nav_depth"], @@ -343,21 +317,15 @@ def _update_object_map( obs[1] = depth self._observations_cache["object_map_rgbd"][0] = tuple(obs) for idx in range(len(detections.logits)): - bbox_denorm = detections.boxes[idx] * np.array( - [width, height, width, height] - ) + bbox_denorm = detections.boxes[idx] * np.array([width, height, width, height]) object_mask = self._mobile_sam.segment_bbox(rgb, bbox_denorm.tolist()) # If we are using vqa, then use the BLIP2 model to visually confirm whether # the contours are actually correct. if self._use_vqa: - contours, _ = cv2.findContours( - object_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE - ) - annotated_rgb = cv2.drawContours( - rgb.copy(), contours, -1, (255, 0, 0), 2 - ) + contours, _ = cv2.findContours(object_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + annotated_rgb = cv2.drawContours(rgb.copy(), contours, -1, (255, 0, 0), 2) question = f"Question: {self._vqa_prompt}" if not detections.phrases[idx].endswith("ing"): question += "a " @@ -391,9 +359,7 @@ def _cache_observations(self, observations: "TensorDict") -> None: """ raise NotImplementedError - def _infer_depth( - self, rgb: np.ndarray, min_depth: float, max_depth: float - ) -> np.ndarray: + def _infer_depth(self, rgb: np.ndarray, min_depth: float, max_depth: float) -> np.ndarray: """Infers the depth image from the rgb image. Args: diff --git a/vlfm/policy/base_policy.py b/vlfm/policy/base_policy.py index b9c6db8..6819ab4 100644 --- a/vlfm/policy/base_policy.py +++ b/vlfm/policy/base_policy.py @@ -54,9 +54,7 @@ def parameters(self) -> Generator: # Save a dummy state_dict using torch.save. This is useful for generating a pth file # that can be used to load other policies that don't even read from checkpoints, # even though habitat requires a checkpoint to be loaded. 
- config = get_config( - "habitat-lab/habitat-baselines/habitat_baselines/config/pointnav/ppo_pointnav_example.yaml" - ) + config = get_config("habitat-lab/habitat-baselines/habitat_baselines/config/pointnav/ppo_pointnav_example.yaml") dummy_dict = { "config": config, "extra_state": {"step": 0}, diff --git a/vlfm/policy/habitat_policies.py b/vlfm/policy/habitat_policies.py index f35ed75..f564f2e 100644 --- a/vlfm/policy/habitat_policies.py +++ b/vlfm/policy/habitat_policies.py @@ -92,13 +92,9 @@ def __init__( self._dataset_type = dataset_type @classmethod - def from_config( - cls, config: DictConfig, *args_unused: Any, **kwargs_unused: Any - ) -> "HabitatMixin": + def from_config(cls, config: DictConfig, *args_unused: Any, **kwargs_unused: Any) -> "HabitatMixin": policy_config: VLFMPolicyConfig = config.habitat_baselines.rl.policy - kwargs = { - k: policy_config[k] for k in VLFMPolicyConfig.kwaarg_names # type: ignore - } + kwargs = {k: policy_config[k] for k in VLFMPolicyConfig.kwaarg_names} # type: ignore # In habitat, we need the height of the camera to generate the camera transform sim_sensors_cfg = config.habitat.simulator.agents.main_agent.sim_sensors @@ -137,16 +133,12 @@ def act( obs_dict[ObjectGoalSensor.cls_uuid] = HM3D_ID_TO_NAME[object_id] elif self._dataset_type == "mp3d": obs_dict[ObjectGoalSensor.cls_uuid] = MP3D_ID_TO_NAME[object_id] - self._non_coco_caption = ( - " . ".join(MP3D_ID_TO_NAME).replace("|", " . ") + " ." - ) + self._non_coco_caption = " . ".join(MP3D_ID_TO_NAME).replace("|", " . ") + " ." else: raise ValueError(f"Dataset type {self._dataset_type} not recognized") parent_cls: BaseObjectNavPolicy = super() # type: ignore try: - action, rnn_hidden_states = parent_cls.act( - obs_dict, rnn_hidden_states, prev_actions, masks, deterministic - ) + action, rnn_hidden_states = parent_cls.act(obs_dict, rnn_hidden_states, prev_actions, masks, deterministic) except StopIteration: action = self._stop_action return PolicyActionData( @@ -178,9 +170,7 @@ def _get_policy_info(self, detections: ObjectDetections) -> Dict[str, Any]: info["start_yaw"] = self._start_yaw return info - def _cache_observations( - self: Union["HabitatMixin", BaseObjectNavPolicy], observations: TensorDict - ) -> None: + def _cache_observations(self: Union["HabitatMixin", BaseObjectNavPolicy], observations: TensorDict) -> None: """Caches the rgb, depth, and camera transform from the observations. Args: diff --git a/vlfm/policy/itm_policy.py b/vlfm/policy/itm_policy.py index 38c6621..878a124 100644 --- a/vlfm/policy/itm_policy.py +++ b/vlfm/policy/itm_policy.py @@ -89,9 +89,7 @@ def _get_best_frontier( Tuple[np.ndarray, float]: The best frontier and its value. """ # The points and values will be sorted in descending order - sorted_pts, sorted_values = self._sort_frontiers_by_value( - observations, frontiers - ) + sorted_pts, sorted_values = self._sort_frontiers_by_value(observations, frontiers) robot_xy = self._observations_cache["robot_xy"] best_frontier_idx = None top_two_values = tuple(sorted_values[:2]) @@ -110,9 +108,7 @@ def _get_best_frontier( break if curr_index is None: - closest_index = closest_point_within_threshold( - sorted_pts, self._last_frontier, threshold=0.5 - ) + closest_index = closest_point_within_threshold(sorted_pts, self._last_frontier, threshold=0.5) if closest_index != -1: # There is a point close to the last point pursued @@ -131,9 +127,7 @@ def _get_best_frontier( # it is not cyclic. 
if best_frontier_idx is None: for idx, frontier in enumerate(sorted_pts): - cyclic = self._acyclic_enforcer.check_cyclic( - robot_xy, frontier, top_two_values - ) + cyclic = self._acyclic_enforcer.check_cyclic(robot_xy, frontier, top_two_values) if cyclic: print("Suppressed cyclic frontier.") continue @@ -209,9 +203,7 @@ def _update_value_map(self) -> None: for cosine, (rgb, depth, tf, min_depth, max_depth, fov) in zip( cosines, self._observations_cache["value_map_rgbd"] ): - self._value_map.update_map( - np.array(cosine), depth, tf, min_depth, max_depth, fov - ) + self._value_map.update_map(np.array(cosine), depth, tf, min_depth, max_depth, fov) self._value_map.update_agent_traj( self._observations_cache["robot_xy"], @@ -240,9 +232,7 @@ def act( self._pre_step(observations, masks) if self._visualize: self._update_value_map() - return super().act( - observations, rnn_hidden_states, prev_actions, masks, deterministic - ) + return super().act(observations, rnn_hidden_states, prev_actions, masks, deterministic) def _reset(self) -> None: super()._reset() @@ -268,9 +258,7 @@ def act( ) -> Any: self._pre_step(observations, masks) self._update_value_map() - return super().act( - observations, rnn_hidden_states, prev_actions, masks, deterministic - ) + return super().act(observations, rnn_hidden_states, prev_actions, masks, deterministic) def _sort_frontiers_by_value( self, observations: "TensorDict", frontiers: np.ndarray @@ -301,9 +289,7 @@ def visualize_value_map(arr: np.ndarray) -> np.ndarray: def _sort_frontiers_by_value( self, observations: "TensorDict", frontiers: np.ndarray ) -> Tuple[np.ndarray, List[float]]: - sorted_frontiers, sorted_values = self._value_map.sort_waypoints( - frontiers, 0.5, reduce_fn=self._reduce_values - ) + sorted_frontiers, sorted_values = self._value_map.sort_waypoints(frontiers, 0.5, reduce_fn=self._reduce_values) return sorted_frontiers, sorted_values diff --git a/vlfm/policy/reality_policies.py b/vlfm/policy/reality_policies.py index 2d46422..fe512f4 100644 --- a/vlfm/policy/reality_policies.py +++ b/vlfm/policy/reality_policies.py @@ -35,19 +35,15 @@ class RealityMixin: _policy_info: Dict[str, Any] = {} _done_initializing: bool = False - def __init__( - self: Union["RealityMixin", ITMPolicyV2], *args: Any, **kwargs: Any - ) -> None: + def __init__(self: Union["RealityMixin", ITMPolicyV2], *args: Any, **kwargs: Any) -> None: super().__init__(sync_explored_areas=True, *args, **kwargs) # type: ignore - self._depth_model = torch.hub.load( - "isl-org/ZoeDepth", "ZoeD_NK", config_mode="eval", pretrained=True - ).to("cuda" if torch.cuda.is_available() else "cpu") + self._depth_model = torch.hub.load("isl-org/ZoeDepth", "ZoeD_NK", config_mode="eval", pretrained=True).to( + "cuda" if torch.cuda.is_available() else "cpu" + ) self._object_map.use_dbscan = False # type: ignore @classmethod - def from_config( - cls, config: DictConfig, *args_unused: Any, **kwargs_unused: Any - ) -> Any: + def from_config(cls, config: DictConfig, *args_unused: Any, **kwargs_unused: Any) -> Any: policy_config: VLFMConfig = config.policy kwargs = {k: policy_config[k] for k in VLFMConfig.kwaarg_names} # type: ignore @@ -62,13 +58,9 @@ def act( deterministic: bool = False, ) -> Dict[str, Any]: if observations["objectgoal"] not in self._non_coco_caption: - self._non_coco_caption = ( - observations["objectgoal"] + " . " + self._non_coco_caption - ) + self._non_coco_caption = observations["objectgoal"] + " . 
" + self._non_coco_caption parent_cls: ITMPolicyV2 = super() # type: ignore - action: Tensor = parent_cls.act( - observations, rnn_hidden_states, prev_actions, masks, deterministic - )[0] + action: Tensor = parent_cls.act(observations, rnn_hidden_states, prev_actions, masks, deterministic)[0] # The output of the policy is a (1, 2) tensor of floats, where the first element # is the linear velocity and the second element is the angular velocity. We @@ -96,9 +88,7 @@ def act( return action_dict - def get_action( - self, observations: Dict[str, Any], masks: Tensor, deterministic: bool = True - ) -> Dict[str, Any]: + def get_action(self, observations: Dict[str, Any], masks: Tensor, deterministic: bool = True) -> Dict[str, Any]: return self.act(observations, None, None, masks, deterministic=deterministic) def _reset(self: Union["RealityMixin", ITMPolicyV2]) -> None: @@ -111,9 +101,7 @@ def _initialize(self) -> Tensor: yaw = self._initial_yaws.pop(0) return torch.tensor([[yaw]], dtype=torch.float32) - def _cache_observations( - self: Union["RealityMixin", ITMPolicyV2], observations: Dict[str, Any] - ) -> None: + def _cache_observations(self: Union["RealityMixin", ITMPolicyV2], observations: Dict[str, Any]) -> None: """Caches the rgb, depth, and camera transform from the observations. Args: @@ -136,9 +124,7 @@ def _cache_observations( explore=False, ) - _, tf, min_depth, max_depth, fx, fy, topdown_fov = observations[ - "obstacle_map_depths" - ][-1] + _, tf, min_depth, max_depth, fx, fy, topdown_fov = observations["obstacle_map_depths"][-1] self._obstacle_map.update_map( None, tf, @@ -151,9 +137,7 @@ def _cache_observations( update_obstacles=False, ) - self._obstacle_map.update_agent_traj( - observations["robot_xy"], observations["robot_heading"] - ) + self._obstacle_map.update_agent_traj(observations["robot_xy"], observations["robot_heading"]) frontiers = self._obstacle_map.frontiers height, width = observations["nav_depth"].shape @@ -169,9 +153,7 @@ def _cache_observations( "value_map_rgbd": observations["value_map_rgbd"], } - def _infer_depth( - self, rgb: np.ndarray, min_depth: float, max_depth: float - ) -> np.ndarray: + def _infer_depth(self, rgb: np.ndarray, min_depth: float, max_depth: float) -> np.ndarray: """Infers the depth image from the rgb image. 
Args: diff --git a/vlfm/policy/utils/acyclic_enforcer.py b/vlfm/policy/utils/acyclic_enforcer.py index be85bcb..d6762b1 100644 --- a/vlfm/policy/utils/acyclic_enforcer.py +++ b/vlfm/policy/utils/acyclic_enforcer.py @@ -19,15 +19,11 @@ def __hash__(self) -> int: class AcyclicEnforcer: history: Set[StateAction] = set() - def check_cyclic( - self, position: np.ndarray, action: Any, other: Any = None - ) -> bool: + def check_cyclic(self, position: np.ndarray, action: Any, other: Any = None) -> bool: state_action = StateAction(position, action, other) cyclic = state_action in self.history return cyclic - def add_state_action( - self, position: np.ndarray, action: Any, other: Any = None - ) -> None: + def add_state_action(self, position: np.ndarray, action: Any, other: Any = None) -> None: state_action = StateAction(position, action, other) self.history.add(state_action) diff --git a/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py b/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py index 6373e96..cae0612 100644 --- a/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py +++ b/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py @@ -19,16 +19,12 @@ def __init__(self) -> None: self.running_mean_and_var = nn.Sequential() self.backbone = resnet18(1, 32, 16) self.compression = nn.Sequential( - nn.Conv2d( - 256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False - ), + nn.Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False), nn.GroupNorm(1, 128, eps=1e-05, affine=True), nn.ReLU(inplace=True), ) - def forward( - self, observations: Dict[str, torch.Tensor] - ) -> torch.Tensor: # type: ignore + def forward(self, observations: Dict[str, torch.Tensor]) -> torch.Tensor: # type: ignore cnn_input = [] for k in self.visual_keys: obs_k = observations[k] @@ -51,9 +47,7 @@ def __init__(self, discrete_actions: bool = False, no_fwd_dict: bool = False): if discrete_actions: self.prev_action_embedding_discrete = nn.Embedding(4 + 1, 32) else: - self.prev_action_embedding_cont = nn.Linear( - in_features=2, out_features=32, bias=True - ) + self.prev_action_embedding_cont = nn.Linear(in_features=2, out_features=32, bias=True) self.tgt_embeding = nn.Linear(in_features=3, out_features=32, bias=True) self.visual_encoder = ResNetEncoder() self.visual_fc = nn.Sequential( @@ -104,9 +98,7 @@ def forward( x.append(prev_actions) out = torch.cat(x, dim=1) - out, rnn_hidden_states = self.state_encoder( - out, rnn_hidden_states, masks, rnn_build_seq_info - ) + out, rnn_hidden_states = self.state_encoder(out, rnn_hidden_states, masks, rnn_build_seq_info) if self.no_fwd_dict: return out, rnn_hidden_states # type: ignore @@ -159,9 +151,7 @@ def act( masks: torch.Tensor, deterministic: bool = False, ) -> Tuple[torch.Tensor, torch.Tensor]: - features, rnn_hidden_states, _ = self.net( - observations, rnn_hidden_states, prev_actions, masks - ) + features, rnn_hidden_states, _ = self.net(observations, rnn_hidden_states, prev_actions, masks) distribution = self.action_distribution(features) if deterministic: diff --git a/vlfm/policy/utils/non_habitat_policy/resnet.py b/vlfm/policy/utils/non_habitat_policy/resnet.py index e1a18c2..9767db8 100755 --- a/vlfm/policy/utils/non_habitat_policy/resnet.py +++ b/vlfm/policy/utils/non_habitat_policy/resnet.py @@ -48,9 +48,7 @@ def forward(self, x: Tensor) -> Tensor: return self.relu(out + residual) -def conv3x3( - in_planes: int, out_planes: int, stride: int = 1, groups: int = 1 -) -> Conv2d: +def conv3x3(in_planes: int, 
out_planes: int, stride: int = 1, groups: int = 1) -> Conv2d: """3x3 convolution with padding""" return nn.Conv2d( in_planes, @@ -98,18 +96,10 @@ def __init__( if block.resneXt: base_planes *= 2 - self.layer1 = self._make_layer( - block, ngroups, base_planes, layers[0] # type: ignore - ) - self.layer2 = self._make_layer( - block, ngroups, base_planes * 2, layers[1], stride=2 # type: ignore - ) - self.layer3 = self._make_layer( - block, ngroups, base_planes * 2 * 2, layers[2], stride=2 # type: ignore - ) - self.layer4 = self._make_layer( - block, ngroups, base_planes * 2 * 2 * 2, layers[3], stride=2 # type: ignore - ) + self.layer1 = self._make_layer(block, ngroups, base_planes, layers[0]) # type: ignore + self.layer2 = self._make_layer(block, ngroups, base_planes * 2, layers[1], stride=2) # type: ignore + self.layer3 = self._make_layer(block, ngroups, base_planes * 2 * 2, layers[2], stride=2) # type: ignore + self.layer4 = self._make_layer(block, ngroups, base_planes * 2 * 2 * 2, layers[3], stride=2) # type: ignore self.final_channels = self.inplanes self.final_spatial_compress = 1.0 / (2**5) diff --git a/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py b/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py index 56a242a..e518b2b 100755 --- a/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py +++ b/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py @@ -57,9 +57,7 @@ def single_forward( ) -> Tuple[torch.Tensor, torch.Tensor]: r"""Forward for a non-sequence input""" - hidden_states = torch.where( - masks.view(1, -1, 1), hidden_states, hidden_states.new_zeros(()) - ) + hidden_states = torch.where(masks.view(1, -1, 1), hidden_states, hidden_states.new_zeros(())) x, hidden_states = self.rnn(x.unsqueeze(0), self.unpack_hidden(hidden_states)) hidden_states = self.pack_hidden(hidden_states) @@ -114,9 +112,7 @@ def forward( x, hidden_states = self.single_forward(x, hidden_states, masks) else: assert rnn_build_seq_info is not None - x, hidden_states = self.seq_forward( - x, hidden_states, masks, rnn_build_seq_info - ) + x, hidden_states = self.seq_forward(x, hidden_states, masks, rnn_build_seq_info) hidden_states = hidden_states.permute(1, 0, 2) @@ -134,14 +130,10 @@ def __init__( # Note: Type handling mypy errors in pytorch libraries prevent # directly setting hidden_states type - def pack_hidden( - self, hidden_states: Any # type is Tuple[torch.Tensor, torch.Tensor] - ) -> torch.Tensor: + def pack_hidden(self, hidden_states: Any) -> torch.Tensor: # type is Tuple[torch.Tensor, torch.Tensor] return torch.cat(hidden_states, 0) - def unpack_hidden( - self, hidden_states: torch.Tensor - ) -> Any: # type is Tuple[torch.Tensor, torch.Tensor] + def unpack_hidden(self, hidden_states: torch.Tensor) -> Any: # type is Tuple[torch.Tensor, torch.Tensor] lstm_states = torch.chunk(hidden_states.contiguous(), 2, 0) return (lstm_states[0], lstm_states[1]) @@ -226,9 +218,7 @@ def build_rnn_out_from_seq( rnn_state_batch_inds = rnn_build_seq_info["rnn_state_batch_inds"] output_hidden_states = hidden_states.index_select( 1, - last_sequence_in_batch_inds[ - _invert_permutation(rnn_state_batch_inds[last_sequence_in_batch_inds]) - ], + last_sequence_in_batch_inds[_invert_permutation(rnn_state_batch_inds[last_sequence_in_batch_inds])], ) return x, output_hidden_states diff --git a/vlfm/policy/utils/pointnav_policy.py b/vlfm/policy/utils/pointnav_policy.py index a030ae5..517d067 100644 --- a/vlfm/policy/utils/pointnav_policy.py +++ b/vlfm/policy/utils/pointnav_policy.py @@ -22,9 +22,7 @@ class 
PointNavResNetTensorOutputPolicy(PointNavResNetPolicy): def act(self, *args: Any, **kwargs: Any) -> Tuple[Tensor, Tensor]: - value, action, action_log_probs, rnn_hidden_states = super().act( - *args, **kwargs - ) + value, action, action_log_probs, rnn_hidden_states = super().act(*args, **kwargs) return action, rnn_hidden_states else: @@ -126,9 +124,7 @@ def reset(self) -> None: """ Resets the hidden state and previous action for the policy. """ - self.pointnav_test_recurrent_hidden_states = torch.zeros_like( - self.pointnav_test_recurrent_hidden_states - ) + self.pointnav_test_recurrent_hidden_states = torch.zeros_like(self.pointnav_test_recurrent_hidden_states) self.pointnav_prev_actions = torch.zeros_like(self.pointnav_prev_actions) @@ -143,9 +139,7 @@ def load_pointnav_policy(file_path: str) -> PointNavResNetTensorOutputPolicy: if HABITAT_BASELINES_AVAILABLE: obs_space = SpaceDict( { - "depth": spaces.Box( - low=0.0, high=1.0, shape=(224, 224, 1), dtype=np.float32 - ), + "depth": spaces.Box(low=0.0, high=1.0, shape=(224, 224, 1), dtype=np.float32), "pointgoal_with_gps_compass": spaces.Box( low=np.finfo(np.float32).min, high=np.finfo(np.float32).max, @@ -174,15 +168,11 @@ def load_pointnav_policy(file_path: str) -> PointNavResNetTensorOutputPolicy: ) # print(pointnav_policy) - pointnav_policy.net = PointNavResNetNet( - discrete_actions=True, no_fwd_dict=True - ) + pointnav_policy.net = PointNavResNetNet(discrete_actions=True, no_fwd_dict=True) state_dict = torch.load(file_path + ".state_dict", map_location="cpu") else: ckpt_dict = torch.load(file_path, map_location="cpu") - pointnav_policy = PointNavResNetTensorOutputPolicy.from_config( - ckpt_dict["config"], obs_space, action_space - ) + pointnav_policy = PointNavResNetTensorOutputPolicy.from_config(ckpt_dict["config"], obs_space, action_space) state_dict = ckpt_dict["state_dict"] pointnav_policy.load_state_dict(state_dict) return pointnav_policy @@ -193,22 +183,13 @@ def load_pointnav_policy(file_path: str) -> PointNavResNetTensorOutputPolicy: current_state_dict = pointnav_policy.state_dict() # Let old checkpoints work with new code if "net.prev_action_embedding_cont.bias" not in ckpt_dict.keys(): - ckpt_dict["net.prev_action_embedding_cont.bias"] = ckpt_dict[ - "net.prev_action_embedding.bias" - ] + ckpt_dict["net.prev_action_embedding_cont.bias"] = ckpt_dict["net.prev_action_embedding.bias"] if "net.prev_action_embedding_cont.weights" not in ckpt_dict.keys(): - ckpt_dict["net.prev_action_embedding_cont.weight"] = ckpt_dict[ - "net.prev_action_embedding.weight" - ] + ckpt_dict["net.prev_action_embedding_cont.weight"] = ckpt_dict["net.prev_action_embedding.weight"] - pointnav_policy.load_state_dict( - {k: v for k, v in ckpt_dict.items() if k in current_state_dict} - ) + pointnav_policy.load_state_dict({k: v for k, v in ckpt_dict.items() if k in current_state_dict}) unused_keys = [k for k in ckpt_dict.keys() if k not in current_state_dict] - print( - "The following unused keys were not loaded when loading the pointnav" - f" policy: {unused_keys}" - ) + print(f"The following unused keys were not loaded when loading the pointnav policy: {unused_keys}") return pointnav_policy diff --git a/vlfm/reality/bdsw_nav_env.py b/vlfm/reality/bdsw_nav_env.py index 93cd6d5..81b8868 100644 --- a/vlfm/reality/bdsw_nav_env.py +++ b/vlfm/reality/bdsw_nav_env.py @@ -9,9 +9,7 @@ from vlfm.reality.robots.bdsw_robot import BDSWRobot -def run_env( - env: PointNavEnv, policy: WrappedPointNavResNetPolicy, goal: np.ndarray -) -> None: +def run_env(env: 
PointNavEnv, policy: WrappedPointNavResNetPolicy, goal: np.ndarray) -> None: observations = env.reset(goal) done = False mask = torch.zeros(1, 1, device=policy.device, dtype=torch.bool) diff --git a/vlfm/reality/objectnav_env.py b/vlfm/reality/objectnav_env.py index 78b9cfa..34bfcfb 100644 --- a/vlfm/reality/objectnav_env.py +++ b/vlfm/reality/objectnav_env.py @@ -103,9 +103,7 @@ def step(self, action: Dict[str, Any]) -> Tuple[Dict, float, bool, Dict]: return super().step(action) if action["arm_yaw"] == 0: - cmd_id = self.robot.spot.move_gripper_to_point( - np.array([0.35, 0.0, 0.3]), np.deg2rad([0.0, 0.0, 0.0]) - ) + cmd_id = self.robot.spot.move_gripper_to_point(np.array([0.35, 0.0, 0.3]), np.deg2rad([0.0, 0.0, 0.0])) self.robot.spot.block_until_arm_arrives(cmd_id, timeout_sec=1.5) else: new_pose = np.array(NOMINAL_ARM_POSE) @@ -119,9 +117,7 @@ def step(self, action: Dict[str, Any]) -> Tuple[Dict, float, bool, Dict]: def _get_obs(self) -> Dict[str, Any]: robot_xy, robot_heading = self._get_gps(), self._get_compass() - nav_depth, obstacle_map_depths, value_map_rgbd, object_map_rgbd = ( - self._get_camera_obs() - ) + nav_depth, obstacle_map_depths, value_map_rgbd, object_map_rgbd = self._get_camera_obs() return { "nav_depth": nav_depth, "robot_xy": robot_xy, @@ -142,9 +138,7 @@ def _get_camera_obs(self) -> Tuple[np.ndarray, List, List, List]: for src in ALL_CAMS: tf = self.tf_global_to_episodic @ cam_data[src]["tf_camera_to_global"] # a tf that remaps from camera conventions to xyz conventions - rotation_matrix = np.array( - [[0, -1, 0, 0], [0, 0, -1, 0], [1, 0, 0, 0], [0, 0, 0, 1]] - ) + rotation_matrix = np.array([[0, -1, 0, 0], [0, 0, -1, 0], [1, 0, 0, 0], [0, 0, 0, 1]]) cam_data[src]["tf_camera_to_global"] = np.dot(tf, rotation_matrix) img = cam_data[src]["image"] @@ -183,9 +177,7 @@ def _get_camera_obs(self) -> Tuple[np.ndarray, List, List, List]: # to be upright f_left = SpotCamIds.FRONTLEFT_DEPTH f_right = SpotCamIds.FRONTRIGHT_DEPTH - nav_cam_data = self.robot.reorient_images( - {k: cam_data[k]["image"] for k in [f_left, f_right]} - ) + nav_cam_data = self.robot.reorient_images({k: cam_data[k]["image"] for k in [f_left, f_right]}) nav_depth = np.hstack([nav_cam_data[f_right], nav_cam_data[f_left]]) nav_depth = filter_depth(nav_depth, blur_type=None, set_black_value=1.0) diff --git a/vlfm/reality/pointnav_env.py b/vlfm/reality/pointnav_env.py index ee5dfbf..afefbde 100644 --- a/vlfm/reality/pointnav_env.py +++ b/vlfm/reality/pointnav_env.py @@ -41,9 +41,7 @@ def __init__( self._cmd_id: Union[None, Any] = None self._num_steps = 0 - def reset( - self, goal: Any, relative: bool = True, *args: Any, **kwargs: Any - ) -> Dict[str, np.ndarray]: + def reset(self, goal: Any, relative: bool = True, *args: Any, **kwargs: Any) -> Dict[str, np.ndarray]: assert isinstance(goal, np.ndarray) if relative: # Transform (x,y) goal from robot frame to global frame @@ -129,18 +127,12 @@ def _get_obs(self) -> Dict[str, np.ndarray]: } def _get_nav_depth(self) -> np.ndarray: - images = self.robot.get_camera_images( - [SpotCamIds.FRONTRIGHT_DEPTH, SpotCamIds.FRONTLEFT_DEPTH] - ) - img = np.hstack( - [images[SpotCamIds.FRONTRIGHT_DEPTH], images[SpotCamIds.FRONTLEFT_DEPTH]] - ) + images = self.robot.get_camera_images([SpotCamIds.FRONTRIGHT_DEPTH, SpotCamIds.FRONTLEFT_DEPTH]) + img = np.hstack([images[SpotCamIds.FRONTRIGHT_DEPTH], images[SpotCamIds.FRONTLEFT_DEPTH]]) img = self._norm_depth(img) return img - def _norm_depth( - self, depth: np.ndarray, max_depth: Optional[float] = None, scale: bool = True - ) -> 
np.ndarray: + def _norm_depth(self, depth: np.ndarray, max_depth: Optional[float] = None, scale: bool = True) -> np.ndarray: if max_depth is None: max_depth = self._max_body_cam_depth diff --git a/vlfm/reality/robots/base_robot.py b/vlfm/reality/robots/base_robot.py index 903ebeb..120a8eb 100644 --- a/vlfm/reality/robots/base_robot.py +++ b/vlfm/reality/robots/base_robot.py @@ -103,9 +103,7 @@ def get_camera_images(self, camera_source: List[str]) -> Dict[str, np.ndarray]: """ for source in camera_source: assert source in CAM_ID_TO_SHAPE, f"Invalid camera source: {source}" - images = { - source: np.random.rand(*CAM_ID_TO_SHAPE[source]) for source in camera_source - } + images = {source: np.random.rand(*CAM_ID_TO_SHAPE[source]) for source in camera_source} return self.reorient_images(images) def command_base_velocity(self, ang_vel: float, lin_vel: float) -> None: diff --git a/vlfm/reality/robots/bdsw_robot.py b/vlfm/reality/robots/bdsw_robot.py index 3b958e5..28b6d24 100644 --- a/vlfm/reality/robots/bdsw_robot.py +++ b/vlfm/reality/robots/bdsw_robot.py @@ -25,9 +25,7 @@ def xy_yaw(self) -> Tuple[np.ndarray, float]: def arm_joints(self) -> np.ndarray: """Returns current angle for each of the 6 arm joints in radians""" arm_proprioception = self.spot.get_arm_proprioception() - current_positions = np.array( - [v.position.value for v in arm_proprioception.values()] - ) + current_positions = np.array([v.position.value for v in arm_proprioception.values()]) return current_positions def get_camera_images(self, camera_source: List[str]) -> Dict[str, np.ndarray]: @@ -101,8 +99,7 @@ def get_camera_data(self, srcs: List[str]) -> Dict[str, Dict[str, Any]]: """ image_responses = self.spot.get_image_responses(srcs) imgs = { - src: self._camera_response_to_data(image_response) - for src, image_response in zip(srcs, image_responses) + src: self._camera_response_to_data(image_response) for src, image_response in zip(srcs, image_responses) } return imgs @@ -116,7 +113,5 @@ def _camera_response_to_data(self, response: Any) -> Dict[str, Any]: "image": image, "fx": fx, "fy": fy, - "tf_camera_to_global": self.spot.get_transform( - from_frame=camera_frame, tf_snapshot=tf_snapshot - ), + "tf_camera_to_global": self.spot.get_transform(from_frame=camera_frame, tf_snapshot=tf_snapshot), } diff --git a/vlfm/reality/run_bdsw_objnav_env.py b/vlfm/reality/run_bdsw_objnav_env.py index 46edf3e..1bb9004 100644 --- a/vlfm/reality/run_bdsw_objnav_env.py +++ b/vlfm/reality/run_bdsw_objnav_env.py @@ -13,9 +13,7 @@ from vlfm.reality.robots.bdsw_robot import BDSWRobot -@hydra.main( - version_base=None, config_path="../../config/", config_name="experiments/reality" -) +@hydra.main(version_base=None, config_path="../../config/", config_name="experiments/reality") def main(cfg: RealityConfig) -> None: print(OmegaConf.to_yaml(cfg)) policy = RealityITMPolicyV2.from_config(cfg) @@ -27,9 +25,7 @@ def main(cfg: RealityConfig) -> None: robot = BDSWRobot(spot) robot.open_gripper() # robot.set_arm_joints(NOMINAL_ARM_POSE, travel_time=0.75) - cmd_id = robot.spot.move_gripper_to_point( - np.array([0.35, 0.0, 0.6]), np.deg2rad([0.0, 20.0, 0.0]) - ) + cmd_id = robot.spot.move_gripper_to_point(np.array([0.35, 0.0, 0.6]), np.deg2rad([0.0, 20.0, 0.0])) spot.block_until_arm_arrives(cmd_id, timeout_sec=1.5) env = ObjectNavEnv( robot=robot, diff --git a/vlfm/semexp_env/eval.py b/vlfm/semexp_env/eval.py index 3dc846a..05a8398 100644 --- a/vlfm/semexp_env/eval.py +++ b/vlfm/semexp_env/eval.py @@ -65,8 +65,7 @@ def main() -> None: policy_cls = 
SemExpITMPolicyV3 policy_kwargs["exploration_thresh"] = exp_thresh policy_kwargs["text_prompt"] = ( - "Seems like there is a target_object ahead.|There is a lot of area to" - " explore ahead." + "Seems like there is a target_object ahead.|There is a lot of area to explore ahead." ) else: policy_cls = SemExpITMPolicyV2 @@ -126,17 +125,13 @@ def main() -> None: print("Test successfully completed") -def merge_obs_infos( - obs: torch.Tensor, infos: Tuple[Dict, ...] -) -> Dict[str, torch.Tensor]: +def merge_obs_infos(obs: torch.Tensor, infos: Tuple[Dict, ...]) -> Dict[str, torch.Tensor]: """Merge the observations and infos into a single dictionary.""" rgb = obs[:, :3, ...].permute(0, 2, 3, 1) depth = obs[:, 3:4, ...].permute(0, 2, 3, 1) info_dict = infos[0] - def tensor_from_numpy( - tensor: torch.Tensor, numpy_array: np.ndarray - ) -> torch.Tensor: + def tensor_from_numpy(tensor: torch.Tensor, numpy_array: np.ndarray) -> torch.Tensor: device = tensor.device new_tensor = torch.from_numpy(numpy_array).to(device) return new_tensor @@ -177,9 +172,7 @@ def create_frame(policy_infos: Dict[str, Any]) -> np.ndarray: return vis_img -def generate_video( - frames: List[np.ndarray], ep_id: str, scene_id: str, infos: Dict[str, Any] -) -> None: +def generate_video(frames: List[np.ndarray], ep_id: str, scene_id: str, infos: Dict[str, Any]) -> None: """ Saves the given list of rgb frames as a video at 10 FPS. Uses the infos to get the files name, which should contain the following: @@ -200,8 +193,7 @@ def generate_video( dtg = infos["distance_to_goal"] goal_name = infos["target_object"] filename = ( - f"epid={episode_id:03d}-scid={scene_id}-succ={success}-spl={spl:.2f}" - f"-dtg={dtg:.2f}-target={goal_name}.mp4" + f"epid={episode_id:03d}-scid={scene_id}-succ={success}-spl={spl:.2f}-dtg={dtg:.2f}-target={goal_name}.mp4" ) filename = os.path.join(video_dir, filename) diff --git a/vlfm/utils/episode_stats_logger.py b/vlfm/utils/episode_stats_logger.py index 892180c..f0165d6 100644 --- a/vlfm/utils/episode_stats_logger.py +++ b/vlfm/utils/episode_stats_logger.py @@ -85,9 +85,7 @@ def was_false_positive(infos: Dict[str, Any]) -> bool: """Return whether the point goal target is within a bounding box.""" target_bboxes_mask = infos["top_down_map"]["target_bboxes_mask"] nav_goal_episodic_xy = infos["nav_goal"] - nav_goal_episodic_xyz = np.array( - [nav_goal_episodic_xy[0], nav_goal_episodic_xy[1], 0] - ).reshape(1, 3) + nav_goal_episodic_xyz = np.array([nav_goal_episodic_xy[0], nav_goal_episodic_xy[1], 0]).reshape(1, 3) upper_bound = infos["top_down_map"]["upper_bound"] lower_bound = infos["top_down_map"]["lower_bound"] diff --git a/vlfm/utils/geometry_utils.py b/vlfm/utils/geometry_utils.py index abd793b..5cc2335 100644 --- a/vlfm/utils/geometry_utils.py +++ b/vlfm/utils/geometry_utils.py @@ -6,9 +6,7 @@ import numpy as np -def rho_theta( - curr_pos: np.ndarray, curr_heading: float, curr_goal: np.ndarray -) -> Tuple[float, float]: +def rho_theta(curr_pos: np.ndarray, curr_heading: float, curr_goal: np.ndarray) -> Tuple[float, float]: """Calculates polar coordinates (rho, theta) relative to a given position and heading to a given goal position. 'rho' is the distance from the agent to the goal, and theta is how many radians the agent must turn (to the left, CCW from above) to @@ -82,14 +80,10 @@ def calculate_vfov(hfov: float, width: int, height: int) -> float: A float representing the VFOV in radians. 
""" # Calculate the diagonal field of view (DFOV) - dfov = 2 * math.atan( - math.tan(hfov / 2) * math.sqrt((width**2 + height**2) / (width**2 + height**2)) - ) + dfov = 2 * math.atan(math.tan(hfov / 2) * math.sqrt((width**2 + height**2) / (width**2 + height**2))) # Calculate the vertical field of view (VFOV) - vfov = 2 * math.atan( - math.tan(dfov / 2) * (height / math.sqrt(width**2 + height**2)) - ) + vfov = 2 * math.atan(math.tan(dfov / 2) * (height / math.sqrt(width**2 + height**2))) return vfov @@ -122,9 +116,7 @@ def within_fov_cone( return points[mask] -def convert_to_global_frame( - agent_pos: np.ndarray, agent_yaw: float, local_pos: np.ndarray -) -> np.ndarray: +def convert_to_global_frame(agent_pos: np.ndarray, agent_yaw: float, local_pos: np.ndarray) -> np.ndarray: """Converts a given position from the agent's local frame to the global frame. Args: @@ -188,9 +180,7 @@ def xyz_yaw_to_tf_matrix(xyz: np.ndarray, yaw: float) -> np.ndarray: return transformation_matrix -def closest_point_within_threshold( - points_array: np.ndarray, target_point: np.ndarray, threshold: float -) -> int: +def closest_point_within_threshold(points_array: np.ndarray, target_point: np.ndarray, threshold: float) -> int: """Find the point within the threshold distance that is closest to the target_point. Args: @@ -202,10 +192,7 @@ def closest_point_within_threshold( Returns: int: The index of the closest point within the threshold distance. """ - distances = np.sqrt( - (points_array[:, 0] - target_point[0]) ** 2 - + (points_array[:, 1] - target_point[1]) ** 2 - ) + distances = np.sqrt((points_array[:, 0] - target_point[0]) ** 2 + (points_array[:, 1] - target_point[1]) ** 2) within_threshold = distances <= threshold if np.any(within_threshold): @@ -215,9 +202,7 @@ def closest_point_within_threshold( return -1 -def transform_points( - transformation_matrix: np.ndarray, points: np.ndarray -) -> np.ndarray: +def transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray: # Add a homogeneous coordinate of 1 to each point for matrix multiplication homogeneous_points = np.hstack((points, np.ones((points.shape[0], 1)))) @@ -228,9 +213,7 @@ def transform_points( return transformed_points[:, :3] / transformed_points[:, 3:] -def get_point_cloud( - depth_image: np.ndarray, mask: np.ndarray, fx: float, fy: float -) -> np.ndarray: +def get_point_cloud(depth_image: np.ndarray, mask: np.ndarray, fx: float, fy: float) -> np.ndarray: """Calculates the 3D coordinates (x, y, z) of points in the depth image based on the horizontal field of view (HFOV), the image width and height, the depth values, and the pixel x and y coordinates. 
diff --git a/vlfm/utils/habitat_visualizer.py b/vlfm/utils/habitat_visualizer.py index 7f622cf..1627674 100644 --- a/vlfm/utils/habitat_visualizer.py +++ b/vlfm/utils/habitat_visualizer.py @@ -67,9 +67,7 @@ def collect_data( # Visualize target point cloud on the map color_point_cloud_on_map(infos, policy_info) - map = maps.colorize_draw_agent_and_fit_to_height( - infos[0]["top_down_map"], self.depth[0].shape[0] - ) + map = maps.colorize_draw_agent_and_fit_to_height(infos[0]["top_down_map"], self.depth[0].shape[0]) self.maps.append(map) vis_map_imgs = [ self._reorient_rescale_habitat_map(infos, policy_info[0][vkey]) @@ -121,9 +119,7 @@ def flush_frames(self, failure_cause: str) -> List[np.ndarray]: return frames @staticmethod - def _reorient_rescale_habitat_map( - infos: List[Dict[str, Any]], vis_map: np.ndarray - ) -> np.ndarray: + def _reorient_rescale_habitat_map(infos: List[Dict[str, Any]], vis_map: np.ndarray) -> np.ndarray: # Rotate the cost map to match the agent's orientation at the start # of the episode start_yaw = infos[0]["start_yaw"] @@ -228,9 +224,7 @@ def sim_xy_to_grid_xy( return grid_xy -def color_point_cloud_on_map( - infos: List[Dict[str, Any]], policy_info: List[Dict[str, Any]] -) -> None: +def color_point_cloud_on_map(infos: List[Dict[str, Any]], policy_info: List[Dict[str, Any]]) -> None: if len(policy_info[0]["target_point_cloud"]) == 0: return @@ -240,9 +234,7 @@ def color_point_cloud_on_map( tf_episodic_to_global = infos[0]["top_down_map"]["tf_episodic_to_global"] cloud_episodic_frame = policy_info[0]["target_point_cloud"][:, :3] - cloud_global_frame_xyz = transform_points( - tf_episodic_to_global, cloud_episodic_frame - ) + cloud_global_frame_xyz = transform_points(tf_episodic_to_global, cloud_episodic_frame) cloud_global_frame_habitat = xyz_to_habitat(cloud_global_frame_xyz) cloud_global_frame_habitat_xy = cloud_global_frame_habitat[:, [2, 0]] diff --git a/vlfm/utils/img_utils.py b/vlfm/utils/img_utils.py index fe5cb0c..fa950d3 100644 --- a/vlfm/utils/img_utils.py +++ b/vlfm/utils/img_utils.py @@ -23,16 +23,12 @@ def rotate_image( height, width = image.shape[0], image.shape[1] center = (width // 2, height // 2) rotation_matrix = cv2.getRotationMatrix2D(center, np.degrees(radians), 1.0) - rotated_image = cv2.warpAffine( - image, rotation_matrix, (width, height), borderValue=border_value - ) + rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height), borderValue=border_value) return rotated_image -def place_img_in_img( - img1: np.ndarray, img2: np.ndarray, row: int, col: int -) -> np.ndarray: +def place_img_in_img(img1: np.ndarray, img2: np.ndarray, row: int, col: int) -> np.ndarray: """Place img2 in img1 such that img2's center is at the specified coordinates (xy) in img1. @@ -44,9 +40,7 @@ def place_img_in_img( Returns: numpy.ndarray: The updated base image with img2 placed. """ - assert ( - 0 <= row < img1.shape[0] and 0 <= col < img1.shape[1] - ), "Pixel location is outside the image." + assert 0 <= row < img1.shape[0] and 0 <= col < img1.shape[1], "Pixel location is outside the image." 
top = row - img2.shape[0] // 2 left = col - img2.shape[1] // 2 bottom = top + img2.shape[0] @@ -62,9 +56,7 @@ def place_img_in_img( img2_bottom = img2_top + (img1_bottom - img1_top) img2_right = img2_left + (img1_right - img1_left) - img1[img1_top:img1_bottom, img1_left:img1_right] = img2[ - img2_top:img2_bottom, img2_left:img2_right - ] + img1[img1_top:img1_bottom, img1_left:img1_right] = img2[img2_top:img2_bottom, img2_left:img2_right] return img1 @@ -88,16 +80,12 @@ def monochannel_to_inferno_rgb(image: np.ndarray) -> np.ndarray: normalized_image = (image - min_val) / peak_to_peak # Apply the Inferno colormap - inferno_colormap = cv2.applyColorMap( - (normalized_image * 255).astype(np.uint8), cv2.COLORMAP_INFERNO - ) + inferno_colormap = cv2.applyColorMap((normalized_image * 255).astype(np.uint8), cv2.COLORMAP_INFERNO) return inferno_colormap -def resize_images( - images: List[np.ndarray], match_dimension: str = "height", use_max: bool = True -) -> List[np.ndarray]: +def resize_images(images: List[np.ndarray], match_dimension: str = "height", use_max: bool = True) -> List[np.ndarray]: """ Resize images to match either their heights or their widths. @@ -118,18 +106,14 @@ def resize_images( else: new_height = min(img.shape[0] for img in images) resized_images = [ - cv2.resize(img, (int(img.shape[1] * new_height / img.shape[0]), new_height)) - for img in images + cv2.resize(img, (int(img.shape[1] * new_height / img.shape[0]), new_height)) for img in images ] elif match_dimension == "width": if use_max: new_width = max(img.shape[1] for img in images) else: new_width = min(img.shape[1] for img in images) - resized_images = [ - cv2.resize(img, (new_width, int(img.shape[0] * new_width / img.shape[1]))) - for img in images - ] + resized_images = [cv2.resize(img, (new_width, int(img.shape[0] * new_width / img.shape[1]))) for img in images] else: raise ValueError("Invalid 'match_dimension' argument. Use 'height' or 'width'.") @@ -185,9 +169,7 @@ def pad_to_square( height, width, _ = img.shape larger_side = max(height, width) square_size = larger_side + extra_pad - padded_img = np.ones((square_size, square_size, 3), dtype=np.uint8) * np.array( - padding_color, dtype=np.uint8 - ) + padded_img = np.ones((square_size, square_size, 3), dtype=np.uint8) * np.array(padding_color, dtype=np.uint8) padded_img = place_img_in_img(padded_img, img, square_size // 2, square_size // 2) return padded_img @@ -251,8 +233,7 @@ def pixel_value_within_radius( """ # Ensure that the pixel location is within the image assert ( - 0 <= pixel_location[0] < image.shape[0] - and 0 <= pixel_location[1] < image.shape[1] + 0 <= pixel_location[0] < image.shape[0] and 0 <= pixel_location[1] < image.shape[1] ), "Pixel location is outside the image." top_left_x = max(0, pixel_location[0] - radius) @@ -285,9 +266,7 @@ def pixel_value_within_radius( raise ValueError(f"Invalid reduction method: {reduction}") -def median_blur_normalized_depth_image( - depth_image: np.ndarray, ksize: int -) -> np.ndarray: +def median_blur_normalized_depth_image(depth_image: np.ndarray, ksize: int) -> np.ndarray: """Applies a median blur to a normalized depth image. 
This function first converts the normalized depth image to a uint8 image,
@@ -337,9 +316,7 @@ def reorient_rescale_map(vis_map_img: np.ndarray) -> np.ndarray:
         # Pad the shorter dimension to be the same size as the longer
         vis_map_img = pad_to_square(vis_map_img, extra_pad=50)
     # Pad the image border with some white space
-    vis_map_img = cv2.copyMakeBorder(
-        vis_map_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=(255, 255, 255)
-    )
+    vis_map_img = cv2.copyMakeBorder(vis_map_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=(255, 255, 255))
 
     return vis_map_img
 
diff --git a/vlfm/utils/log_saver.py b/vlfm/utils/log_saver.py
index c1d200a..450f446 100644
--- a/vlfm/utils/log_saver.py
+++ b/vlfm/utils/log_saver.py
@@ -19,9 +19,7 @@ def log_episode(episode_id: Union[str, int], scene_id: str, data: Dict) -> None:
     if not (os.path.exists(filename) and os.path.getsize(filename) > 0):
         print(f"Logging episode {int(episode_id):04d} to {filename}")
         with open(filename, "w") as f:
-            json.dump(
-                {"episode_id": episode_id, "scene_id": scene_id, **data}, f, indent=4
-            )
+            json.dump({"episode_id": episode_id, "scene_id": scene_id, **data}, f, indent=4)
 
 
 def is_evaluated(episode_id: Union[str, int], scene_id: str) -> bool:
diff --git a/vlfm/utils/visualization.py b/vlfm/utils/visualization.py
index ac4577c..cfaa749 100644
--- a/vlfm/utils/visualization.py
+++ b/vlfm/utils/visualization.py
@@ -95,9 +95,7 @@ def generate_text_image(width: int, text: str) -> np.ndarray:
     return image
 
 
-def pad_images(
-    images: List[np.ndarray], pad_from_top: bool = False
-) -> List[np.ndarray]:
+def pad_images(images: List[np.ndarray], pad_from_top: bool = False) -> List[np.ndarray]:
     """
     Pads a list of images with white pixels to make them have the same dimensions.
 
diff --git a/vlfm/utils/vlfm_trainer.py b/vlfm/utils/vlfm_trainer.py
index db344e2..a002b2f 100644
--- a/vlfm/utils/vlfm_trainer.py
+++ b/vlfm/utils/vlfm_trainer.py
@@ -99,9 +99,7 @@ def _eval_checkpoint(
         self._init_envs(config, is_eval=True)
         self._agent = self._create_agent(None)
 
-        action_shape, discrete_actions = get_action_space_info(
-            self._agent.policy_action_space
-        )
+        action_shape, discrete_actions = get_action_space_info(self._agent.policy_action_space)
 
         if self._agent.actor_critic.should_load_agent_state:
             self._agent.load_state_dict(ckpt_dict)
@@ -131,14 +129,10 @@ def _eval_checkpoint(
             device=self.device,
             dtype=torch.bool,
         )
-        stats_episodes: Dict[Any, Any] = (
-            {}
-        )  # dict of dicts that stores stats per episode
+        stats_episodes: Dict[Any, Any] = {}  # dict of dicts that stores stats per episode
         ep_eval_count: Dict[Any, int] = defaultdict(lambda: 0)
 
-        rgb_frames: List[List[np.ndarray]] = [
-            [] for _ in range(self.config.habitat_baselines.num_environments)
-        ]
+        rgb_frames: List[List[np.ndarray]] = [[] for _ in range(self.config.habitat_baselines.num_environments)]
 
         if len(self.config.habitat_baselines.eval.video_option) > 0:
             os.makedirs(self.config.habitat_baselines.video_dir, exist_ok=True)
@@ -151,16 +145,13 @@ def _eval_checkpoint(
             # if total_num_eps is negative, it means the number of evaluation episodes is unknown
             if total_num_eps < number_of_eval_episodes and total_num_eps > 1:
                 logger.warn(
-                        f"Config specified {number_of_eval_episodes} eval episodes"
-                        ", dataset only has {total_num_eps}."
+                        f"Config specified {number_of_eval_episodes} eval episodes, dataset only has {total_num_eps}." 
) logger.warn(f"Evaluating with {total_num_eps} instead.") number_of_eval_episodes = total_num_eps else: assert evals_per_ep == 1 - assert ( - number_of_eval_episodes > 0 - ), "You must specify a number of evaluation episodes with test_episode_count" + assert number_of_eval_episodes > 0, "You must specify a number of evaluation episodes with test_episode_count" pbar = tqdm.tqdm(total=number_of_eval_episodes * evals_per_ep) self._agent.eval() @@ -170,10 +161,7 @@ def _eval_checkpoint( num_successes = 0 num_total = 0 hab_vis = HabitatVis() - while ( - len(stats_episodes) < (number_of_eval_episodes * evals_per_ep) - and self.envs.num_envs > 0 - ): + while len(stats_episodes) < (number_of_eval_episodes * evals_per_ep) and self.envs.num_envs > 0: current_episodes_info = self.envs.current_episodes() with inference_mode(): @@ -202,9 +190,7 @@ def _eval_checkpoint( else: for i, should_insert in enumerate(action_data.should_inserts): if should_insert.item(): - test_recurrent_hidden_states[i] = ( - action_data.rnn_hidden_states[i] - ) + test_recurrent_hidden_states[i] = action_data.rnn_hidden_states[i] prev_actions[i].copy_(action_data.actions[i]) # type: ignore # NB: Move actions to CPU. If CUDA tensors are # sent in to env.step(), that will create CUDA contexts @@ -240,9 +226,7 @@ def _eval_checkpoint( device="cpu", ) - rewards = torch.tensor( - rewards_l, dtype=torch.float, device="cpu" - ).unsqueeze(1) + rewards = torch.tensor(rewards_l, dtype=torch.float, device="cpu").unsqueeze(1) current_episode_reward += rewards next_episodes_info = self.envs.current_episodes() envs_to_pause = [] @@ -281,10 +265,7 @@ def _eval_checkpoint( if episode_stats["success"] == 1: num_successes += 1 num_total += 1 - print( - f"Success rate: {num_successes / num_total * 100:.2f}% " - f"({num_successes} out of {num_total})" - ) + print(f"Success rate: {num_successes / num_total * 100:.2f}% ({num_successes} out of {num_total})") from vlfm.utils.episode_stats_logger import ( log_episode_stats, @@ -358,9 +339,7 @@ def _eval_checkpoint( aggregated_stats = {} for stat_key in next(iter(stats_episodes.values())).keys(): - aggregated_stats[stat_key] = np.mean( - [v[stat_key] for v in stats_episodes.values()] - ) + aggregated_stats[stat_key] = np.mean([v[stat_key] for v in stats_episodes.values()]) for k, v in aggregated_stats.items(): logger.info(f"Average episode {k}: {v:.4f}") @@ -369,9 +348,7 @@ def _eval_checkpoint( if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]: step_id = ckpt_dict["extra_state"]["step"] - writer.add_scalar( - "eval_reward/average_reward", aggregated_stats["reward"], step_id - ) + writer.add_scalar("eval_reward/average_reward", aggregated_stats["reward"], step_id) metrics = {k: v for k, v in aggregated_stats.items() if k != "reward"} for k, v in metrics.items(): diff --git a/vlfm/vlm/blip2.py b/vlfm/vlm/blip2.py index 204ea07..85803ba 100644 --- a/vlfm/vlm/blip2.py +++ b/vlfm/vlm/blip2.py @@ -46,15 +46,11 @@ def ask(self, image: np.ndarray, prompt: Optional[str] = None) -> str: """ pil_img = Image.fromarray(image) with torch.inference_mode(): - processed_image = ( - self.vis_processors["eval"](pil_img).unsqueeze(0).to(self.device) - ) + processed_image = self.vis_processors["eval"](pil_img).unsqueeze(0).to(self.device) if prompt is None or prompt == "": out = self.model.generate({"image": processed_image})[0] else: - out = self.model.generate({"image": processed_image, "prompt": prompt})[ - 0 - ] + out = self.model.generate({"image": processed_image, "prompt": prompt})[0] return out 
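For context on the warning-message fix in the vlfm/utils/vlfm_trainer.py hunk above: in an f-string, doubled braces are escapes that render literal braces, so {{total_num_eps}} would have printed the placeholder name rather than the episode count; single braces interpolate, matching the adjacent logger.warn(f"Evaluating with {total_num_eps} instead.") line. A two-line illustration of the semantics (standalone snippet, not repository code):

total_num_eps = 7
print(f"dataset only has {{total_num_eps}}.")  # prints: dataset only has {total_num_eps}.
print(f"dataset only has {total_num_eps}.")    # prints: dataset only has 7.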
diff --git a/vlfm/vlm/blip2itm.py b/vlfm/vlm/blip2itm.py
index 4715759..6cb1e7d 100644
--- a/vlfm/vlm/blip2itm.py
+++ b/vlfm/vlm/blip2itm.py
@@ -26,13 +26,11 @@ def __init__(
         if device is None:
             device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

-        self.model, self.vis_processors, self.text_processors = (
-            load_model_and_preprocess(
-                name=name,
-                model_type=model_type,
-                is_eval=True,
-                device=device,
-            )
+        self.model, self.vis_processors, self.text_processors = load_model_and_preprocess(
+            name=name,
+            model_type=model_type,
+            is_eval=True,
+            device=device,
         )
         self.device = device
@@ -51,9 +49,7 @@ def cosine(self, image: np.ndarray, txt: str) -> float:
         img = self.vis_processors["eval"](pil_img).unsqueeze(0).to(self.device)
         txt = self.text_processors["eval"](txt)
         with torch.inference_mode():
-            cosine = self.model(
-                {"image": img, "text_input": txt}, match_head="itc"
-            ).item()
+            cosine = self.model({"image": img, "text_input": txt}, match_head="itc").item()

         return cosine

diff --git a/vlfm/vlm/detections.py b/vlfm/vlm/detections.py
index 7ad1ebc..20c823a 100644
--- a/vlfm/vlm/detections.py
+++ b/vlfm/vlm/detections.py
@@ -76,9 +76,7 @@ def filter_by_class(self, classes: List[str]) -> None:
         Args:
             classes (List[str]): List of classes to keep.
         """
-        keep: torch.Tensor = torch.tensor(
-            [p in classes for p in self.phrases], dtype=torch.bool
-        )
+        keep: torch.Tensor = torch.tensor([p in classes for p in self.phrases], dtype=torch.bool)
         self._filter(keep)

     def _filter(self, keep: torch.Tensor) -> None:
@@ -162,9 +160,7 @@ def annotate(
         # If the box appears to be in normalized coordinates, de-normalize using the
         # image dimensions
         if box.max() <= 1:
-            box = box * np.array(
-                [img.shape[1], img.shape[0], img.shape[1], img.shape[0]]
-            )
+            box = box * np.array([img.shape[1], img.shape[0], img.shape[1], img.shape[0]])
         box = box.astype(int)

         # Draw bounding box
diff --git a/vlfm/vlm/fiber.py b/vlfm/vlm/fiber.py
index e4bc35c..3b17451 100644
--- a/vlfm/vlm/fiber.py
+++ b/vlfm/vlm/fiber.py
@@ -14,9 +14,7 @@


 class FIBER:
-    def __init__(
-        self, config_file: str = DEFAULT_CONFIG, weights: str = DEFAULT_WEIGHTS
-    ):
+    def __init__(self, config_file: str = DEFAULT_CONFIG, weights: str = DEFAULT_WEIGHTS):
         cfg.merge_from_file(config_file)
         cfg.num_gpus = 1
         cfg.SOLVER.IMS_PER_BATCH = 1
@@ -32,9 +30,7 @@ def __init__(

         self.fiber = GLIPDemo(cfg, confidence_threshold=0.2)

-    def detect(
-        self, image: np.ndarray, phrase: str, visualize: bool = False
-    ) -> ObjectDetections:
+    def detect(self, image: np.ndarray, phrase: str, visualize: bool = False) -> ObjectDetections:
         """
         Given an image and a phrase, this function detects the presence of the most
         suitable object described by the phrase in the image. The output object's
@@ -57,9 +53,7 @@ def detect(
         """
         result = self.fiber.inference(image, phrase)
         # Normalize result.bbox to be between 0 and 1
-        normalized_bbox = result.bbox / torch.tensor(
-            [image.shape[1], image.shape[0], image.shape[1], image.shape[0]]
-        )
+        normalized_bbox = result.bbox / torch.tensor([image.shape[1], image.shape[0], image.shape[1], image.shape[0]])

         dets = ObjectDetections(
             image_source=image,
@@ -76,9 +70,7 @@ class FIBERClient:
     def __init__(self, url: str = "http://localhost:9080/fiber"):
         self.url = url

-    def detect(
-        self, image: np.ndarray, phrase: str, visualize: bool = False
-    ) -> ObjectDetections:
+    def detect(self, image: np.ndarray, phrase: str, visualize: bool = False) -> ObjectDetections:
         response = send_request(self.url, image=image, phrase=phrase)["response"]
         detections = ObjectDetections.from_json(response, image_source=image)

diff --git a/vlfm/vlm/grounding_dino.py b/vlfm/vlm/grounding_dino.py
index cd29b8f..69aadcb 100644
--- a/vlfm/vlm/grounding_dino.py
+++ b/vlfm/vlm/grounding_dino.py
@@ -13,9 +13,7 @@
 try:
     from groundingdino.util.inference import load_model, predict
 except ModuleNotFoundError:
-    print(
-        "Could not import groundingdino. This is OK if you are only using the client."
-    )
+    print("Could not import groundingdino. This is OK if you are only using the client.")

 GROUNDING_DINO_CONFIG = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
 GROUNDING_DINO_WEIGHTS = "data/groundingdino_swint_ogc.pth"
@@ -32,16 +30,12 @@ def __init__(
         text_threshold: float = 0.25,
         device: torch.device = torch.device("cuda"),
     ):
-        self.model = load_model(
-            model_config_path=config_path, model_checkpoint_path=weights_path
-        ).to(device)
+        self.model = load_model(model_config_path=config_path, model_checkpoint_path=weights_path).to(device)
         self.caption = caption
         self.box_threshold = box_threshold
         self.text_threshold = text_threshold

-    def predict(
-        self, image: np.ndarray, caption: Optional[str] = None
-    ) -> ObjectDetections:
+    def predict(self, image: np.ndarray, caption: Optional[str] = None) -> ObjectDetections:
         """
         This function makes predictions on an input image tensor or numpy array
         using a pretrained model.
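Both halves of the bounding-box convention show up in this patch: `FIBER.detect` normalizes pixel boxes to [0, 1] by dividing by (width, height, width, height), and `ObjectDetections.annotate` multiplies by the same array before drawing, guarded by the `box.max() <= 1` check. A standalone numpy sketch of that round trip, with a hypothetical image size and box (not code from the repo):

    import numpy as np

    img_h, img_w = 480, 640
    scale = np.array([img_w, img_h, img_w, img_h], dtype=np.float64)

    box_px = np.array([64.0, 48.0, 320.0, 240.0])  # xyxy, pixel coordinates
    box_norm = box_px / scale                      # what FIBER.detect produces
    assert box_norm.max() <= 1                     # the heuristic annotate() keys on

    box_back = (box_norm * scale).astype(int)      # annotate()'s de-normalization
    assert (box_back == box_px.astype(int)).all()

The `box.max() <= 1` heuristic is only ambiguous for a pixel-space box confined to the top-left pixel, which is why keeping boxes normalized end to end, as the FIBER path does, is the safer convention.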
@@ -57,9 +51,7 @@ def predict(
         """
         # Convert image to tensor and normalize from 0-255 to 0-1
         image_tensor = F.to_tensor(image)
-        image_transformed = F.normalize(
-            image_tensor, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
-        )
+        image_transformed = F.normalize(image_tensor, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
         if caption is None:
             caption_to_use = self.caption
         else:
@@ -86,9 +78,7 @@ class GroundingDINOClient:
     def __init__(self, port: int = 12181):
         self.url = f"http://localhost:{port}/gdino"

-    def predict(
-        self, image_numpy: np.ndarray, caption: Optional[str] = ""
-    ) -> ObjectDetections:
+    def predict(self, image_numpy: np.ndarray, caption: Optional[str] = "") -> ObjectDetections:
         response = send_request(self.url, image=image_numpy, caption=caption)
         detections = ObjectDetections.from_json(response, image_source=image_numpy)

diff --git a/vlfm/vlm/sam.py b/vlfm/vlm/sam.py
index 548b6bd..fd0d7a4 100644
--- a/vlfm/vlm/sam.py
+++ b/vlfm/vlm/sam.py
@@ -52,9 +52,7 @@ def segment_bbox(self, image: np.ndarray, bbox: List[int]) -> np.ndarray:
         """
         with torch.inference_mode():
             self.predictor.set_image(image)
-            masks, _, _ = self.predictor.predict(
-                box=np.array(bbox), multimask_output=False
-            )
+            masks, _, _ = self.predictor.predict(box=np.array(bbox), multimask_output=False)

         return masks[0]

@@ -87,9 +85,7 @@ def process_payload(self, payload: dict) -> dict:
         cropped_mask_str = bool_arr_to_str(cropped_mask)
         return {"cropped_mask": cropped_mask_str}

-    mobile_sam = MobileSAMServer(
-        sam_checkpoint=os.environ.get("MOBILE_SAM_CHECKPOINT", "data/mobile_sam.pt")
-    )
+    mobile_sam = MobileSAMServer(sam_checkpoint=os.environ.get("MOBILE_SAM_CHECKPOINT", "data/mobile_sam.pt"))
     print("Model loaded!")
     print(f"Hosting on port {args.port}...")
     host_model(mobile_sam, name="mobile_sam", port=args.port)
diff --git a/vlfm/vlm/yolov7.py b/vlfm/vlm/yolov7.py
index 0ec8c4f..809e00f 100644
--- a/vlfm/vlm/yolov7.py
+++ b/vlfm/vlm/yolov7.py
@@ -28,13 +28,9 @@


 class YOLOv7:
-    def __init__(
-        self, weights: str, image_size: int = 640, half_precision: bool = True
-    ):
+    def __init__(self, weights: str, image_size: int = 640, half_precision: bool = True):
         """Loads the model and saves it to a field."""
-        self.device = (
-            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-        )
+        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
         self.half_precision = self.device.type != "cpu" and half_precision
         self.model = attempt_load(weights, map_location=self.device)  # load FP32 model
         stride = int(self.model.stride.max())  # model stride
@@ -45,9 +41,7 @@ def __init__(

         # Warm-up
         if self.device.type != "cpu":
-            dummy_img = torch.rand(
-                1, 3, int(self.image_size * 0.7), self.image_size
-            ).to(self.device)
+            dummy_img = torch.rand(1, 3, int(self.image_size * 0.7), self.image_size).to(self.device)
             if self.half_precision:
                 dummy_img = dummy_img.half()
             for i in range(3):
@@ -111,9 +105,7 @@ def predict(
         logits = pred[:, 4]
         phrases = [COCO_CLASSES[int(i)] for i in pred[:, 5]]

-        detections = ObjectDetections(
-            boxes, logits, phrases, image_source=image, fmt="xyxy"
-        )
+        detections = ObjectDetections(boxes, logits, phrases, image_source=image, fmt="xyxy")

         return detections
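For context on the preprocessing that `GroundingDINO.predict` performs above: `F.to_tensor` converts the uint8 HWC image to a float CHW tensor in [0, 1], and `F.normalize` then applies the standard ImageNet channel statistics. A self-contained sketch of just that step, using a random stand-in image (torchvision's functional API, same mean/std as the code above):

    import numpy as np
    import torchvision.transforms.functional as F

    # Hypothetical 640x480 RGB frame standing in for a real observation.
    image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)

    image_tensor = F.to_tensor(image)  # CHW float32 in [0, 1]
    image_transformed = F.normalize(
        image_tensor, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )
    print(image_transformed.shape)  # torch.Size([3, 480, 640])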