From 98be30ed7ee343bbdd479a858a6ff548cb4ee721 Mon Sep 17 00:00:00 2001 From: Bernadette Bucher <131724239+bernadettekb@users.noreply.github.com> Date: Fri, 15 Dec 2023 12:54:20 -0500 Subject: [PATCH 1/3] mypy fixes, bug fixes, deprecated code removal (#18) Co-authored-by: Katrina Ashton --- vlfm/mapping/base_map.py | 3 +- vlfm/mapping/object_point_cloud_map.py | 4 +- vlfm/mapping/obstacle_map.py | 6 +- vlfm/mapping/traj_visualizer.py | 4 +- vlfm/policy/base_objectnav_policy.py | 4 +- vlfm/policy/reality_policies.py | 8 +- .../non_habitat_policy/nh_pointnav_policy.py | 8 +- .../policy/utils/non_habitat_policy/resnet.py | 3 +- .../non_habitat_policy/rnn_state_encoder.py | 38 ++-- vlfm/policy/utils/pointnav_policy.py | 2 +- vlfm/reality/bdsw_nav_env.py | 3 +- vlfm/reality/objectnav_env.py | 6 +- vlfm/reality/pointnav_env.py | 10 +- vlfm/reality/robots/bdsw_robot.py | 4 +- vlfm/semexp_env/semexp_policy.py | 169 ------------------ vlfm/utils/episode_stats_logger.py | 2 +- vlfm/utils/geometry_utils.py | 6 +- vlfm/vlm/detections.py | 19 +- vlfm/vlm/server_wrapper.py | 8 +- 19 files changed, 74 insertions(+), 233 deletions(-) delete mode 100644 vlfm/semexp_env/semexp_policy.py diff --git a/vlfm/mapping/base_map.py b/vlfm/mapping/base_map.py index 62f2eeb..8247481 100644 --- a/vlfm/mapping/base_map.py +++ b/vlfm/mapping/base_map.py @@ -8,10 +8,9 @@ class BaseMap: - _confidence_mask: np.ndarray = None _camera_positions: List[np.ndarray] = [] _last_camera_yaw: float = 0.0 - _map_dtype: np.dtype = np.float32 + _map_dtype: np.dtype = np.dtype(np.float32) def __init__( self, size: int = 1000, pixels_per_meter: int = 20, *args: Any, **kwargs: Any diff --git a/vlfm/mapping/object_point_cloud_map.py b/vlfm/mapping/object_point_cloud_map.py index df38d54..eb0acaa 100644 --- a/vlfm/mapping/object_point_cloud_map.py +++ b/vlfm/mapping/object_point_cloud_map.py @@ -55,7 +55,7 @@ def update_map( else: # Mark all points of local_cloud whose distance from the camera is too far # as being out of range - within_range = local_cloud[:, 0] <= max_depth * 0.95 # 5% margin + within_range = (local_cloud[:, 0] <= max_depth * 0.95) * 1.0 # 5% margin # All values of 1 in within_range will be considered within range, and all # values of 0 will be considered out of range; these 0s need to be # assigned with a random number so that they can be identified later. @@ -147,7 +147,7 @@ def update_explored( def get_target_cloud(self, target_class: str) -> np.ndarray: target_cloud = self.clouds[target_class].copy() # Determine whether any points are within range - within_range_exists: bool = np.any(target_cloud[:, -1] == 1) + within_range_exists = np.any(target_cloud[:, -1] == 1) if within_range_exists: # Filter out all points that are not within range target_cloud = target_cloud[target_cloud[:, -1] == 1] diff --git a/vlfm/mapping/obstacle_map.py b/vlfm/mapping/obstacle_map.py index 79a2962..c840347 100644 --- a/vlfm/mapping/obstacle_map.py +++ b/vlfm/mapping/obstacle_map.py @@ -1,5 +1,7 @@ # Copyright (c) 2023 Boston Dynamics AI Institute LLC. All rights reserved. +from typing import Any, Union + import cv2 import numpy as np from frontier_exploration.frontier_detection import detect_frontier_waypoints @@ -15,7 +17,7 @@ class ObstacleMap(BaseMap): and another representing the obstacles that the robot has seen so far. """ - _map_dtype: np.dtype = bool + _map_dtype: np.dtype = np.dtype(bool) _frontiers_px: np.ndarray = np.array([]) frontiers: np.ndarray = np.array([]) radius_padding_color: tuple = (100, 100, 100) @@ -52,7 +54,7 @@ def reset(self) -> None: def update_map( self, - depth: np.ndarray, + depth: Union[np.ndarray, Any], tf_camera_to_episodic: np.ndarray, min_depth: float, max_depth: float, diff --git a/vlfm/mapping/traj_visualizer.py b/vlfm/mapping/traj_visualizer.py index 4233c92..661c0cb 100644 --- a/vlfm/mapping/traj_visualizer.py +++ b/vlfm/mapping/traj_visualizer.py @@ -36,7 +36,9 @@ def draw_trajectory( img = self._draw_agent(img, camera_positions[-1], camera_yaw) return img - def _draw_path(self, img: np.ndarray, camera_positions: np.ndarray) -> np.ndarray: + def _draw_path( + self, img: np.ndarray, camera_positions: Union[np.ndarray, List[np.ndarray]] + ) -> np.ndarray: """Draws the path on the image and returns it""" if len(camera_positions) < 2: return img diff --git a/vlfm/policy/base_objectnav_policy.py b/vlfm/policy/base_objectnav_policy.py index a81da18..85a9c67 100644 --- a/vlfm/policy/base_objectnav_policy.py +++ b/vlfm/policy/base_objectnav_policy.py @@ -34,8 +34,8 @@ class BasePolicy: # type: ignore class BaseObjectNavPolicy(BasePolicy): _target_object: str = "" _policy_info: Dict[str, Any] = {} - _object_masks: np.ndarray = None # set by ._update_object_map() - _stop_action: Tensor = None # MUST BE SET BY SUBCLASS + _object_masks: Union[np.ndarray, Any] = None # set by ._update_object_map() + _stop_action: Union[Tensor, Any] = None # MUST BE SET BY SUBCLASS _observations_cache: Dict[str, Any] = {} _non_coco_caption = "" _load_yolo: bool = True diff --git a/vlfm/policy/reality_policies.py b/vlfm/policy/reality_policies.py index c650f01..6711dff 100644 --- a/vlfm/policy/reality_policies.py +++ b/vlfm/policy/reality_policies.py @@ -56,7 +56,7 @@ def from_config( def act( self: Union["RealityMixin", ITMPolicyV2], observations: Dict[str, Any], - rnn_hidden_states: Tensor, + rnn_hidden_states: Union[Tensor, Any], prev_actions: Any, masks: Tensor, deterministic: bool = False, @@ -77,10 +77,10 @@ def act( if self._done_initializing: angular = action[0][0].item() linear = action[0][1].item() - arm_yaw = -1 + arm_yaw = -1.0 else: - angular = 0 - linear = 0 + angular = 0.0 + linear = 0.0 arm_yaw = action[0][0].item() self._done_initializing = len(self._initial_yaws) == 0 diff --git a/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py b/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py index f0578e0..6373e96 100644 --- a/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py +++ b/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py @@ -49,9 +49,9 @@ class PointNavResNetNet(nn.Module): def __init__(self, discrete_actions: bool = False, no_fwd_dict: bool = False): super().__init__() if discrete_actions: - self.prev_action_embedding = nn.Embedding(4 + 1, 32) + self.prev_action_embedding_discrete = nn.Embedding(4 + 1, 32) else: - self.prev_action_embedding = nn.Linear( + self.prev_action_embedding_cont = nn.Linear( in_features=2, out_features=32, bias=True ) self.tgt_embeding = nn.Linear(in_features=3, out_features=32, bias=True) @@ -95,11 +95,11 @@ def forward( prev_actions = prev_actions.squeeze(-1) start_token = torch.zeros_like(prev_actions) # The mask means the previous action will be zero, an extra dummy action - prev_actions = self.prev_action_embedding( + prev_actions = self.prev_action_embedding_discrete( torch.where(masks.view(-1), prev_actions + 1, start_token) ) else: - prev_actions = self.prev_action_embedding(masks * prev_actions.float()) + prev_actions = self.prev_action_embedding_cont(masks * prev_actions.float()) x.append(prev_actions) diff --git a/vlfm/policy/utils/non_habitat_policy/resnet.py b/vlfm/policy/utils/non_habitat_policy/resnet.py index e94737e..e1a18c2 100755 --- a/vlfm/policy/utils/non_habitat_policy/resnet.py +++ b/vlfm/policy/utils/non_habitat_policy/resnet.py @@ -6,7 +6,7 @@ # https://github.com/facebookresearch/habitat-lab/blob/main/habitat-baselines/habitat_baselines/rl/ddppo/policy/resnet.py # This is a filtered down version that only support ResNet-18 -from typing import List, Optional, Type, cast +from typing import List, Optional, Type from torch import Tensor from torch import nn as nn @@ -149,7 +149,6 @@ def _make_layer( def forward(self, x: Tensor) -> Tensor: x = self.conv1(x) x = self.maxpool(x) - x = cast(Tensor, x) x = self.layer1(x) x = self.layer2(x) x = self.layer3(x) diff --git a/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py b/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py index 30e0ea2..66e121f 100755 --- a/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py +++ b/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py @@ -6,7 +6,7 @@ # https://github.com/facebookresearch/habitat-lab/blob/main/habitat-baselines/habitat_baselines/rl/models/rnn_state_encoder.py # This is a filtered down version that only supports LSTM -from typing import Dict, Optional, Tuple +from typing import Any, Dict, Optional, Tuple import torch import torch.nn as nn @@ -21,6 +21,24 @@ class RNNStateEncoder(nn.Module): timesteps to handle episodes ending in the middle of a rollout. """ + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int = 1, + ): + super().__init__() + + self.num_recurrent_layers = num_layers * 2 + + self.rnn = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + ) + + self.layer_init() + def layer_init(self) -> None: for name, param in self.rnn.named_parameters(): if "weight" in name: @@ -112,26 +130,18 @@ def __init__( hidden_size: int, num_layers: int = 1, ): - super().__init__() - - self.num_recurrent_layers = num_layers * 2 - - self.rnn = nn.LSTM( - input_size=input_size, - hidden_size=hidden_size, - num_layers=num_layers, - ) - - self.layer_init() + super().__init__(input_size, hidden_size, num_layers) + # Note: Type handling mypy errors in pytorch libraries prevent + # directly setting hidden_states type def pack_hidden( - self, hidden_states: Tuple[torch.Tensor, torch.Tensor] + self, hidden_states: Any # type is Tuple[torch.Tensor, torch.Tensor] ) -> torch.Tensor: return torch.cat(hidden_states, 0) def unpack_hidden( self, hidden_states: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> Any: # type is Tuple[torch.Tensor, torch.Tensor] lstm_states = torch.chunk(hidden_states.contiguous(), 2, 0) return (lstm_states[0], lstm_states[1]) diff --git a/vlfm/policy/utils/pointnav_policy.py b/vlfm/policy/utils/pointnav_policy.py index b05b55d..002de45 100644 --- a/vlfm/policy/utils/pointnav_policy.py +++ b/vlfm/policy/utils/pointnav_policy.py @@ -203,7 +203,7 @@ def load_pointnav_policy(file_path: str) -> PointNavResNetTensorOutputPolicy: def move_obs_to_device( - observations: Dict[str, Union[Tensor, np.ndarray]], + observations: Dict[str, Any], device: torch.device, unsqueeze: bool = False, ) -> Dict[str, Tensor]: diff --git a/vlfm/reality/bdsw_nav_env.py b/vlfm/reality/bdsw_nav_env.py index 3072ee6..93cd6d5 100644 --- a/vlfm/reality/bdsw_nav_env.py +++ b/vlfm/reality/bdsw_nav_env.py @@ -16,8 +16,9 @@ def run_env( done = False mask = torch.zeros(1, 1, device=policy.device, dtype=torch.bool) action = policy.act(observations, mask) + action_dict = {"rho_theta": action} while not done: - observations, _, done, info = env.step(action) + observations, _, done, info = env.step(action_dict) action = policy.act(observations, mask, deterministic=True) mask = torch.ones_like(mask) diff --git a/vlfm/reality/objectnav_env.py b/vlfm/reality/objectnav_env.py index a8b2264..78b9cfa 100644 --- a/vlfm/reality/objectnav_env.py +++ b/vlfm/reality/objectnav_env.py @@ -59,7 +59,7 @@ def __init__(self, max_gripper_cam_depth: float, *args: Any, **kwargs: Any) -> N self._vis_dir = f"{date_string}" os.makedirs(f"vis/{self._vis_dir}", exist_ok=True) - def reset(self, goal: Any, *args: Any, **kwargs: Any) -> Dict[str, np.ndarray]: + def reset(self, goal: Any, *args: Any, **kwargs: Any) -> Dict[str, Any]: assert isinstance(goal, str) self.target_object = goal # Transformation matrix from where the robot started to the global frame @@ -69,7 +69,7 @@ def reset(self, goal: Any, *args: Any, **kwargs: Any) -> Dict[str, np.ndarray]: self.episodic_start_yaw = self.robot.xy_yaw[1] return self._get_obs() - def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]: + def step(self, action: Dict[str, Any]) -> Tuple[Dict, float, bool, Dict]: # Parent class only moves the base; if we want to move the gripper camera, # we need to do it here vis_imgs = [] @@ -117,7 +117,7 @@ def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]: return self._get_obs(), 0.0, done, {} # not using info dict yet - def _get_obs(self) -> Dict[str, np.ndarray]: + def _get_obs(self) -> Dict[str, Any]: robot_xy, robot_heading = self._get_gps(), self._get_compass() nav_depth, obstacle_map_depths, value_map_rgbd, object_map_rgbd = ( self._get_camera_obs() diff --git a/vlfm/reality/pointnav_env.py b/vlfm/reality/pointnav_env.py index 191ee96..ee5dfbf 100644 --- a/vlfm/reality/pointnav_env.py +++ b/vlfm/reality/pointnav_env.py @@ -54,7 +54,7 @@ def reset( self.goal = goal return self._get_obs() - def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]: + def step(self, action: Dict[str, Any]) -> Tuple[Dict, float, bool, Dict]: if self._cmd_id is not None: cmd_status = 0 while cmd_status != 1: @@ -100,15 +100,13 @@ def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]: self._num_steps += 1 return self._get_obs(), 0.0, done, {} # not using info dict yet - def _compute_velocities(self, action: Dict[str, np.ndarray]) -> Tuple[float, float]: + def _compute_velocities(self, action: Dict[str, Any]) -> Tuple[float, float]: ang_dist, lin_dist = self._compute_displacements(action) ang_vel = ang_dist / self._time_step lin_vel = lin_dist / self._time_step return ang_vel, lin_vel - def _compute_displacements( - self, action: Dict[str, np.ndarray] - ) -> Tuple[float, float]: + def _compute_displacements(self, action: Dict[str, Any]) -> Tuple[float, float]: displacements = [] for action_key, max_dist in ( ("angular", self._max_ang_dist), @@ -120,7 +118,7 @@ def _compute_displacements( act_val = action[action_key] dist = np.clip(act_val, -1.0, 1.0) # clip to [-1, 1] dist *= max_dist # scale to max distance - displacements.append(dist) # convert to velocity + displacements.append(dist) # convert to velocities ang_dist, lin_dist = displacements return ang_dist, lin_dist diff --git a/vlfm/reality/robots/bdsw_robot.py b/vlfm/reality/robots/bdsw_robot.py index aa81e96..3b958e5 100644 --- a/vlfm/reality/robots/bdsw_robot.py +++ b/vlfm/reality/robots/bdsw_robot.py @@ -89,7 +89,7 @@ def open_gripper(self) -> None: """Opens the gripper""" self.spot.open_gripper() - def get_camera_data(self, srcs: List[str]) -> Dict[str, np.ndarray]: + def get_camera_data(self, srcs: List[str]) -> Dict[str, Dict[str, Any]]: """Returns a dict that maps each camera id to its image, focal lengths, and transform matrix (from camera to global frame). @@ -106,7 +106,7 @@ def get_camera_data(self, srcs: List[str]) -> Dict[str, np.ndarray]: } return imgs - def _camera_response_to_data(self, response: Any) -> Dict[str, np.ndarray]: + def _camera_response_to_data(self, response: Any) -> Dict[str, Any]: image: np.ndarray = image_response_to_cv2(response, reorient=False) fx: float = response.source.pinhole.intrinsics.focal_length.x fy: float = response.source.pinhole.intrinsics.focal_length.y diff --git a/vlfm/semexp_env/semexp_policy.py b/vlfm/semexp_env/semexp_policy.py deleted file mode 100644 index 3571219..0000000 --- a/vlfm/semexp_env/semexp_policy.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2023 Boston Dynamics AI Institute LLC. All rights reserved. - -from typing import Any, Dict, Tuple, Union - -import numpy as np -import torch -from depth_camera_filtering import filter_depth -from torch import Tensor - -from vlfm.mapping.obstacle_map import ObstacleMap -from vlfm.policy.base_objectnav_policy import BaseObjectNavPolicy -from vlfm.policy.itm_policy import ITMPolicy, ITMPolicyV2, ITMPolicyV3 -from vlfm.utils.geometry_utils import xyz_yaw_to_tf_matrix -from vlfm.vlm.grounding_dino import ObjectDetections - - -class TorchActionIDs: - STOP = torch.tensor([[0]], dtype=torch.long) - MOVE_FORWARD = torch.tensor([[1]], dtype=torch.long) - TURN_LEFT = torch.tensor([[2]], dtype=torch.long) - TURN_RIGHT = torch.tensor([[3]], dtype=torch.long) - - -class SemExpMixin: - """This Python mixin only contains code relevant for running a BaseObjectNavPolicy - explicitly within Habitat (vs. the real world, etc.) and will endow any parent class - (that is a subclass of BaseObjectNavPolicy) with the necessary methods to run in - Habitat. - """ - - _stop_action: Tensor = TorchActionIDs.STOP - _start_yaw: Union[float, None] = None # must be set by _reset() method - _observations_cache: Dict[str, Any] = {} - _policy_info: Dict[str, Any] = {} - - def __init__( - self: Union["SemExpMixin", BaseObjectNavPolicy], - camera_height: float, - min_depth: float, - max_depth: float, - camera_fov: float, - image_width: int, - *args: Any, - **kwargs: Any, - ) -> None: - super().__init__(*args, **kwargs) # type: ignore - assert self._compute_frontiers, "Must set self._compute_frontiers = True" - self._camera_height = camera_height - self._min_depth = min_depth - self._max_depth = max_depth - camera_fov_rad = np.deg2rad(camera_fov) - self._camera_fov = camera_fov_rad - self._fx = self._fy = image_width / (2 * np.tan(camera_fov_rad / 2)) - - self._compute_frontiers: bool = super()._compute_frontiers # type: ignore - - def act( - self: Union["SemExpMixin", BaseObjectNavPolicy], - observations: Dict[str, Union[Tensor, str]], - rnn_hidden_states: Any, - prev_actions: Any, - masks: Tensor, - deterministic: bool = True, - ) -> Tuple[Tensor, Dict[str, Any]]: - """Converts object ID to string name, returns action as PolicyActionData""" - parent_cls: BaseObjectNavPolicy = super() # type: ignore - try: - action, rnn_hidden_states = parent_cls.act( - observations, None, None, masks, deterministic - ) - except StopIteration: - action = self._stop_action - return action, self._policy_info - - def _initialize(self) -> Tensor: - """Turn left 30 degrees 12 times to get a 360 view at the beginning""" - self._done_initializing = not self._num_steps < 11 # type: ignore - return TorchActionIDs.TURN_LEFT - - def _reset(self) -> None: - parent_cls: BaseObjectNavPolicy = super() # type: ignore - parent_cls._reset() - self._start_yaw = None - - def _get_policy_info(self, detections: ObjectDetections) -> Dict[str, Any]: - """Get policy info for logging""" - parent_cls: BaseObjectNavPolicy = super() # type: ignore - info = parent_cls._get_policy_info(detections) - - if not self._visualize: # type: ignore - return info - - if self._start_yaw is None: - self._start_yaw = self._observations_cache["habitat_start_yaw"] - info["start_yaw"] = self._start_yaw - return info - - def _cache_observations( - self: Union["SemExpMixin", BaseObjectNavPolicy], observations: Dict[str, Any] - ) -> None: - """Caches the rgb, depth, and camera transform from the observations. - - Args: - observations (TensorDict): The observations from the current timestep. - """ - if len(self._observations_cache) > 0: - return - rgb = observations["rgb"][0].cpu().numpy() - depth = observations["depth"][0].cpu().numpy() - x, y = observations["gps"][0].cpu().numpy() - camera_yaw = observations["compass"][0].cpu().item() - depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None) - # Habitat GPS makes west negative, so flip y - camera_position = np.array([x, -y, self._camera_height]) - robot_xy = camera_position[:2] - tf_camera_to_episodic = xyz_yaw_to_tf_matrix(camera_position, camera_yaw) - - self._obstacle_map: ObstacleMap - self._obstacle_map.update_map( - depth, - tf_camera_to_episodic, - self._min_depth, - self._max_depth, - self._fx, - self._fy, - self._camera_fov, - ) - frontiers = self._obstacle_map.frontiers - self._obstacle_map.update_agent_traj(robot_xy, camera_yaw) - self._observations_cache = { - "frontier_sensor": frontiers, - "nav_depth": observations["depth"], # for pointnav - "robot_xy": robot_xy, - "robot_heading": camera_yaw, - "object_map_rgbd": [ - ( - rgb, - depth, - tf_camera_to_episodic, - self._min_depth, - self._max_depth, - self._fx, - self._fy, - ) - ], - "value_map_rgbd": [ - ( - rgb, - depth, - tf_camera_to_episodic, - self._min_depth, - self._max_depth, - self._camera_fov, - ) - ], - "habitat_start_yaw": observations["heading"][0].item(), - } - - -class SemExpITMPolicy(SemExpMixin, ITMPolicy): - pass - - -class SemExpITMPolicyV2(SemExpMixin, ITMPolicyV2): - pass - - -class SemExpITMPolicyV3(SemExpMixin, ITMPolicyV3): - pass diff --git a/vlfm/utils/episode_stats_logger.py b/vlfm/utils/episode_stats_logger.py index 4b31da9..892180c 100644 --- a/vlfm/utils/episode_stats_logger.py +++ b/vlfm/utils/episode_stats_logger.py @@ -77,7 +77,7 @@ def was_target_seen(infos: Dict[str, Any]) -> bool: explored_area = infos["top_down_map"]["fog_of_war_mask"] # Dilate the target_bboxes_mask by 10 pixels to add a margin of error target_bboxes_mask = cv2.dilate(target_bboxes_mask, np.ones((10, 10))) - target_explored = np.any(np.logical_and(explored_area, target_bboxes_mask)) + target_explored = bool(np.any(np.logical_and(explored_area, target_bboxes_mask))) return target_explored diff --git a/vlfm/utils/geometry_utils.py b/vlfm/utils/geometry_utils.py index e03e7e9..9a66882 100644 --- a/vlfm/utils/geometry_utils.py +++ b/vlfm/utils/geometry_utils.py @@ -1,7 +1,7 @@ # Copyright (c) 2023 Boston Dynamics AI Institute LLC. All rights reserved. import math -from typing import Tuple, Union +from typing import Tuple import numpy as np @@ -33,7 +33,7 @@ def rho_theta( rho = np.linalg.norm(local_goal) theta = np.arctan2(local_goal[1], local_goal[0]) - return rho, theta + return float(rho), float(theta) def get_rotation_matrix(angle: float, ndims: int = 2) -> np.ndarray: @@ -58,7 +58,7 @@ def get_rotation_matrix(angle: float, ndims: int = 2) -> np.ndarray: raise ValueError("ndims must be 2 or 3") -def wrap_heading(theta: Union[float, np.ndarray]) -> Union[float, np.ndarray]: +def wrap_heading(theta: float) -> float: """Wraps given angle to be between -pi and pi. Args: diff --git a/vlfm/vlm/detections.py b/vlfm/vlm/detections.py index 9dbae44..3ec1917 100644 --- a/vlfm/vlm/detections.py +++ b/vlfm/vlm/detections.py @@ -23,7 +23,7 @@ def __init__( boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str], - image_source: Optional[np.ndarray] = None, + image_source: Optional[np.ndarray], fmt: str = "cxcywh", ): self.image_source = image_source @@ -36,8 +36,8 @@ def __init__( self._annotated_frame: Optional[np.ndarray] = None @property - def annotated_frame(self) -> np.ndarray: - if self._annotated_frame is None: + def annotated_frame(self) -> Optional[np.ndarray]: + if self._annotated_frame is None and self.image_source is not None: self._annotated_frame = annotate( image_source=self.image_source, boxes=self.boxes, @@ -67,7 +67,7 @@ def filter_by_conf(self, conf_thresh: float) -> None: Args: conf_thresh (float): Confidence threshold to filter detections. """ - keep: torch.Tensor[bool] = torch.ge(self.logits, conf_thresh) # >= + keep: torch.Tensor = torch.ge(self.logits, conf_thresh) # >= self._filter(keep) def filter_by_class(self, classes: List[str]) -> None: @@ -76,7 +76,7 @@ def filter_by_class(self, classes: List[str]) -> None: Args: classes (List[str]): List of classes to keep. """ - keep: torch.Tensor[bool] = torch.tensor( + keep: torch.Tensor = torch.tensor( [p in classes for p in self.phrases], dtype=torch.bool ) self._filter(keep) @@ -215,15 +215,16 @@ def draw_bounding_box( if color is None: # Randomly choose a color from the rainbow colormap (so boxes aren't black) - single_pixel = np.array([[np.random.randint(0, 256)]], dtype=np.uint8) + single_pixel = np.array([[np.random.randint(0, 256)]]) single_pixel = cv2.applyColorMap(single_pixel, cv2.COLORMAP_RAINBOW) # reshape to a single dimensional array - color = single_pixel.reshape(3) + rand_color = single_pixel.reshape(3) + color = [int(c) for c in rand_color] # type: ignore else: # Convert RGB to BGR - color = color[::-1] - color = [int(c) for c in color] # type: ignore + bgr_color = color[::-1] + color = [int(c) for c in bgr_color] # type: ignore # Draw bounding box on image box_thickness = 2 diff --git a/vlfm/vlm/server_wrapper.py b/vlfm/vlm/server_wrapper.py index 32d2d00..344057a 100644 --- a/vlfm/vlm/server_wrapper.py +++ b/vlfm/vlm/server_wrapper.py @@ -37,8 +37,7 @@ def process_request() -> Dict[str, Any]: def bool_arr_to_str(arr: np.ndarray) -> str: """Converts a boolean array to a string.""" - packed = np.packbits(arr) - packed_str = base64.b64encode(packed).decode() + packed_str = base64.b64encode(arr.tobytes()).decode() return packed_str @@ -50,9 +49,8 @@ def str_to_bool_arr(s: str, shape: tuple) -> np.ndarray: # Convert bytes to np.uint8 array bytes_array = np.frombuffer(bytes_, dtype=np.uint8) - # Unpack the bytes back into a boolean array - unpacked = np.unpackbits(bytes_array) - unpacked = unpacked.reshape(shape) + # Reshape the data back into a boolean array + unpacked = bytes_array.reshape(shape) return unpacked From 77cf94da077f70ddc0cf34b72752f69827a98ede Mon Sep 17 00:00:00 2001 From: Katrina Ashton Date: Fri, 15 Dec 2023 14:20:37 -0500 Subject: [PATCH 2/3] Reality changes. Match laptop config for reality, copy over required file, allow loading in pointnav weight checkpoint used in previous experiments --- config/experiments/reality.yaml | 4 ++-- vlfm/policy/utils/pointnav_policy.py | 9 ++++++++- vlfm/reality/robots/frame_ids.py | 8 ++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 vlfm/reality/robots/frame_ids.py diff --git a/config/experiments/reality.yaml b/config/experiments/reality.yaml index 632851f..4c213e0 100644 --- a/config/experiments/reality.yaml +++ b/config/experiments/reality.yaml @@ -10,7 +10,7 @@ policy: name: "RealityITMPolicyV2" pointnav_policy_path: "data/pointnav_weights.pth" depth_image_shape: [212, 240] # height, width - pointnav_stop_radius: 1.2 + pointnav_stop_radius: 0.9 use_max_confidence: False object_map_erosion_size: 5 exploration_thresh: 0.0 @@ -21,7 +21,7 @@ policy: agent_radius: 0.2 env: - max_body_cam_depth: 3.5 + max_body_cam_depth: 2.5 max_gripper_cam_depth: 5.0 max_lin_dist: 0.2 max_ang_dist: 0.523599 diff --git a/vlfm/policy/utils/pointnav_policy.py b/vlfm/policy/utils/pointnav_policy.py index 002de45..9a23177 100644 --- a/vlfm/policy/utils/pointnav_policy.py +++ b/vlfm/policy/utils/pointnav_policy.py @@ -190,7 +190,14 @@ def load_pointnav_policy(file_path: str) -> PointNavResNetTensorOutputPolicy: else: ckpt_dict = torch.load(file_path, map_location="cpu") pointnav_policy = PointNavResNetTensorOutputPolicy() - current_state_dict = pointnav_policy.state_dict() + current_state_dict = pointnav_policy.state_dict() + #Let old checkpoints work with new code + if 'net.prev_action_embedding_cont.bias' not in ckpt_dict.keys(): + ckpt_dict['net.prev_action_embedding_cont.bias'] = ckpt_dict['net.prev_action_embedding.bias'] + if 'net.prev_action_embedding_cont.weights' not in ckpt_dict.keys(): + ckpt_dict['net.prev_action_embedding_cont.weight'] = ckpt_dict['net.prev_action_embedding.weight'] + + pointnav_policy.load_state_dict( {k: v for k, v in ckpt_dict.items() if k in current_state_dict} ) diff --git a/vlfm/reality/robots/frame_ids.py b/vlfm/reality/robots/frame_ids.py new file mode 100644 index 0000000..7da70ec --- /dev/null +++ b/vlfm/reality/robots/frame_ids.py @@ -0,0 +1,8 @@ +class SpotFrameIds: + BODY: str = "body" + FLAT_BODY: str = "flat_body" + GPE: str = "gpe" + HAND: str = "hand" + LINK_WR1: str = "link_wr1" + ODOM: str = "odom" + VISION: str = "vision" From 8365ad0d0384a5e2ad75dfb5bd2b0e45f92e34fa Mon Sep 17 00:00:00 2001 From: Katrina Ashton Date: Fri, 15 Dec 2023 16:22:43 -0500 Subject: [PATCH 3/3] Working in real world --- vlfm/policy/reality_policies.py | 38 +++++++++++++++------------------ vlfm/vlm/detections.py | 9 ++++---- vlfm/vlm/server_wrapper.py | 8 ++++--- 3 files changed, 26 insertions(+), 29 deletions(-) diff --git a/vlfm/policy/reality_policies.py b/vlfm/policy/reality_policies.py index 6711dff..00f028d 100644 --- a/vlfm/policy/reality_policies.py +++ b/vlfm/policy/reality_policies.py @@ -75,36 +75,32 @@ def act( # convert this numpy array to a dictionary with keys "angular" and "linear" so # that it can be passed to the Spot robot. if self._done_initializing: - angular = action[0][0].item() - linear = action[0][1].item() - arm_yaw = -1.0 + action_dict = { + "angular": action[0][0].item(), + "linear": action[0][1].item(), + "arm_yaw": -1, + "info": self._policy_info, + } else: - angular = 0.0 - linear = 0.0 - arm_yaw = action[0][0].item() + action_dict = { + "angular": 0, + "linear": 0, + "arm_yaw": action[0][0].item(), + "info": self._policy_info, + } + + if "rho_theta" in self._policy_info: + action_dict["rho_theta"] = self._policy_info["rho_theta"] self._done_initializing = len(self._initial_yaws) == 0 - action = torch.tensor([[angular, linear, arm_yaw]], dtype=torch.float32) + return action_dict - return action, rnn_hidden_states def get_action( self, observations: Dict[str, Any], masks: Tensor, deterministic: bool = True ) -> Dict[str, Any]: - actions, _ = self.act( - observations, None, None, masks, deterministic=deterministic - ) - action_dict = { - "angular": actions[0], - "linear": actions[1], - "arm_yaw": actions[2], - "info": self._policy_info, - } - if "rho_theta" in self._policy_info: - action_dict["rho_theta"] = self._policy_info["rho_theta"] - - return action_dict + return self.act(observations, None, None, masks, deterministic=deterministic) def _reset(self: Union["RealityMixin", ITMPolicyV2]) -> None: parent_cls: ITMPolicyV2 = super() # type: ignore diff --git a/vlfm/vlm/detections.py b/vlfm/vlm/detections.py index 3ec1917..9f87b1b 100644 --- a/vlfm/vlm/detections.py +++ b/vlfm/vlm/detections.py @@ -215,16 +215,15 @@ def draw_bounding_box( if color is None: # Randomly choose a color from the rainbow colormap (so boxes aren't black) - single_pixel = np.array([[np.random.randint(0, 256)]]) + single_pixel = np.array([[np.random.randint(0, 256)]], dtype=np.uint8) single_pixel = cv2.applyColorMap(single_pixel, cv2.COLORMAP_RAINBOW) # reshape to a single dimensional array - rand_color = single_pixel.reshape(3) - color = [int(c) for c in rand_color] # type: ignore + color = single_pixel.reshape(3) else: # Convert RGB to BGR - bgr_color = color[::-1] - color = [int(c) for c in bgr_color] # type: ignore + color = color[::-1] + color = [int(c) for c in color] # Draw bounding box on image box_thickness = 2 diff --git a/vlfm/vlm/server_wrapper.py b/vlfm/vlm/server_wrapper.py index 344057a..32d2d00 100644 --- a/vlfm/vlm/server_wrapper.py +++ b/vlfm/vlm/server_wrapper.py @@ -37,7 +37,8 @@ def process_request() -> Dict[str, Any]: def bool_arr_to_str(arr: np.ndarray) -> str: """Converts a boolean array to a string.""" - packed_str = base64.b64encode(arr.tobytes()).decode() + packed = np.packbits(arr) + packed_str = base64.b64encode(packed).decode() return packed_str @@ -49,8 +50,9 @@ def str_to_bool_arr(s: str, shape: tuple) -> np.ndarray: # Convert bytes to np.uint8 array bytes_array = np.frombuffer(bytes_, dtype=np.uint8) - # Reshape the data back into a boolean array - unpacked = bytes_array.reshape(shape) + # Unpack the bytes back into a boolean array + unpacked = np.unpackbits(bytes_array) + unpacked = unpacked.reshape(shape) return unpacked