Source code for fastdev.xform.utils

import re
from typing import List, Literal, Union, overload
import math

import numpy as np
import torch
from beartype import beartype
from jaxtyping import Float
from torch.nn import functional as F

COMMON_COORDS = {
    "opencv": "x: right, y: down, z: front",
    "opengl": "x: right, y: up, z: back",
    "sapien": "x: front, y: left, z: up",  # https://sapien.ucsd.edu/docs/latest/tutorial/basic/hello_world.html#viewer
}

# NOTE it doesn't matter if `right` correspond to [1, 0, 0], the resulted matrix is the same
DIRECTIONS = {
    "right": np.array([1, 0, 0], dtype=np.float32),
    "left": np.array([-1, 0, 0], dtype=np.float32),
    "up": np.array([0, -1, 0], dtype=np.float32),
    "down": np.array([0, 1, 0], dtype=np.float32),
    "front": np.array([0, 0, 1], dtype=np.float32),
    "back": np.array([0, 0, -1], dtype=np.float32),
}


@overload

[docs]
def coord_conversion(
    src_spec: str, dst_spec: str, check_handness: bool = True, return_tensors: Literal["pt"] = "pt"
) -> Float[torch.Tensor, "3 3"]: ...

@overload
def coord_conversion(
    src_spec: str, dst_spec: str, check_handness: bool = True, return_tensors: Literal["np"] = "np"
) -> Float[np.ndarray, "3 3"]: ...
@beartype
def coord_conversion(
    src_spec: str, dst_spec: str, check_handness: bool = True, return_tensors: Literal["np", "pt"] = "np"
) -> Union[Float[np.ndarray, "3 3"], Float[torch.Tensor, "3 3"]]:
    """
    Construct a rotation matrix based on given source and destination coordinate specifications.

    Args:
        src_spec: Source coordinate specification, e.g., "x: right, y: down, z: front" or "opencv".
        dst_spec: Destination coordinate specification, e.g., "x: right, y: up, z: back" or "opengl".
        check_handness: If True, checks if the rotation matrix preserves right-handedness.
        return_tensors: Return type of the rotation matrix, either "np" for NumPy array or "pt" for PyTorch tensor.

    Returns:
        A 3x3 rotation matrix converting coordinates from the source to the destination specification.

    Examples:
        >>> coord_conversion("opencv", "opengl")
        array([[ 1.,  0.,  0.],
               [ 0., -1.,  0.],
               [ 0.,  0., -1.]], dtype=float32)
        >>> coord_conversion("x: front, y: left, z: up", "x: left, y: up, z: front")
        array([[0., 1., 0.],
               [0., 0., 1.],
               [1., 0., 0.]], dtype=float32)
        >>> coord_conversion("x: right, y: down, z: front", "x: left, y: up, z: front")
        array([[-1.,  0.,  0.],
               [ 0., -1.,  0.],
               [ 0.,  0.,  1.]], dtype=float32)
        >>> coord_conversion("x: left, y: up, z: front", "x: front, y: left, z: up", return_tensors="pt")
        tensor([[0., 0., 1.],
                [1., 0., 0.],
                [0., 1., 0.]])
    """

    def parse_spec(spec: str) -> List[str]:
        spec = spec.strip().lower()
        if spec in COMMON_COORDS:
            coord = COMMON_COORDS[spec]
        else:
            coord = spec
        # Use regex to parse the coordinate specification
        pattern = r"\s*(x|y|z)\s*:\s*(\w+)\s*"
        matches = re.findall(pattern, coord)
        if len(matches) != 3:
            raise ValueError(f"Invalid coordinate specification: '{spec}'.")
        dirs = {axis: direction for axis, direction in matches}
        if set(dirs.keys()) != {"x", "y", "z"}:
            raise ValueError(f"Invalid coordinate specification: '{spec}'.")
        return [dirs["x"], dirs["y"], dirs["z"]]

    src_dirs = parse_spec(src_spec)
    dst_dirs = parse_spec(dst_spec)

    src_basis = np.stack([DIRECTIONS[dir] for dir in src_dirs])
    dst_basis = np.stack([DIRECTIONS[dir] for dir in dst_dirs])

    rot_mat = dst_basis @ src_basis.T

    if check_handness and np.linalg.det(rot_mat) < 0:
        raise RuntimeWarning("The rotation matrix is not right-handed.")

    if return_tensors == "pt":
        return torch.from_numpy(rot_mat).float()
    elif return_tensors == "np":
        return rot_mat.astype(np.float32)
    else:
        raise ValueError(f"Invalid return_tensors: '{return_tensors}'")



[docs]
def compose_intr_mat(fu: float, fv: float, cu: float, cv: float, skew: float = 0.0) -> np.ndarray:
    """
    Args:
        fu: horizontal focal length (width)
        fv: vertical focal length (height)
        cu: horizontal principal point (width)
        cv: vertical principal point (height)
        skew: skew coefficient, default to 0
    """
    intr_mat = np.array([[fu, skew, cu], [0.0, fv, cv], [0.0, 0.0, 1.0]], dtype=np.float32)
    return intr_mat



# Ref: https://github.com/facebookresearch/pytorch3d/blob/main/pytorch3d/renderer/cameras.py
@beartype

[docs]
def look_at_rotation(
    camera_position: Float[torch.Tensor, "*batch 3"],
    at: Float[torch.Tensor, "*batch 3"],
    up: Float[torch.Tensor, "*batch 3"],
) -> Float[torch.Tensor, "*batch 3 3"]:
    """
    This function takes a vector `camera_position` which specifies the location of the camera in world coordinates and
    two vectors `at` and `up` which indicate the position of the object and the up directions of the world
    coordinate system respectively.

    The output is a rotation matrix representing the rotation from camera coordinates to world coordinates.

    We use the OpenGL coordinate in this function, i.e. x -> right, y -> up, z -> backward.
    Hence, z_axis: pos - at, x_axis: cross(up, z_axis), y_axis: cross(z_axis, x_axis)

    Note that our implementation differs from pytorch3d.
        1. our matrix is in the OpenGL coordinate
        2. our matrix is column-major
        3. our matrix is the camera-to-world transformation

    Args:
        camera_position: position of the camera in world coordinates
        at: position of the object in world coordinates
        up: vector specifying the up direction in the world coordinate frame.

    Returns:
        R: rotation matrices of shape [..., 3, 3]
    """
    dtype, device = camera_position.dtype, camera_position.device
    at, up = torch.broadcast_to(at, camera_position.shape), torch.broadcast_to(up, camera_position.shape)
    z_axis = F.normalize(camera_position - at, eps=1e-5, dim=-1)
    x_axis = F.normalize(torch.cross(up, z_axis, dim=-1), eps=1e-5, dim=-1)
    y_axis = F.normalize(torch.cross(z_axis, x_axis, dim=-1), eps=1e-5, dim=-1)
    is_close = torch.isclose(x_axis, torch.tensor(0.0, dtype=dtype, device=device), atol=5e-3)
    is_close = is_close.all(dim=-1, keepdim=True)
    if is_close.any():
        replacement = F.normalize(torch.cross(y_axis, z_axis, dim=-1), eps=1e-5)
        x_axis = torch.where(is_close, replacement, x_axis)
    rot_mat = torch.cat((x_axis[..., None, :], y_axis[..., None, :], z_axis[..., None, :]), dim=-2)
    return rot_mat.transpose(-2, -1)



Device = Union[str, torch.device]


def make_device(device: Device) -> torch.device:
    """
    Makes an actual torch.device object from the device specified as
    either a string or torch.device object. If the device is `cuda` without
    a specific index, the index of the current device is assigned.

    Args:
        device: Device (as str or torch.device)

    Returns:
        A matching torch.device object
    """
    device = torch.device(device) if isinstance(device, str) else device
    if device.type == "cuda" and device.index is None:
        # If cuda but with no index, then the current cuda device is indicated.
        # In that case, we fix to that device
        device = torch.device(f"cuda:{torch.cuda.current_device()}")
    return device


def format_tensor(
    input,
    dtype: torch.dtype = torch.float32,
    device: Device = "cpu",
) -> torch.Tensor:
    """
    Helper function for converting a scalar value to a tensor.

    Args:
        input: Python scalar, Python list/tuple, torch scalar, 1D torch tensor
        dtype: data type for the input
        device: Device (as str or torch.device) on which the tensor should be placed.

    Returns:
        input_vec: torch tensor with optional added batch dimension.
    """
    device_ = make_device(device)
    if not torch.is_tensor(input):
        input = torch.tensor(input, dtype=dtype, device=device_)

    if input.dim() == 0:
        input = input.view(1)

    if input.device == device_:
        return input

    input = input.to(device=device)
    return input


def convert_to_tensors_and_broadcast(
    *args,
    dtype: torch.dtype = torch.float32,
    device: Device = "cpu",
):
    """
    Helper function to handle parsing an arbitrary number of inputs (*args)
    which all need to have the same batch dimension.
    The output is a list of tensors.

    Args:
        *args: an arbitrary number of inputs
            Each of the values in `args` can be one of the following
                - Python scalar
                - Torch scalar
                - Torch tensor of shape (N, K_i) or (1, K_i) where K_i are
                  an arbitrary number of dimensions which can vary for each
                  value in args. In this case each input is broadcast to a
                  tensor of shape (N, K_i)
        dtype: data type to use when creating new tensors.
        device: torch device on which the tensors should be placed.

    Output:
        args: A list of tensors of shape (N, K_i)
    """
    # Convert all inputs to tensors with a batch dimension
    args_1d = [format_tensor(c, dtype, device) for c in args]

    # Find broadcast size
    sizes = [c.shape[0] for c in args_1d]
    N = max(sizes)

    args_Nd = []
    for c in args_1d:
        if c.shape[0] != 1 and c.shape[0] != N:
            msg = "Got non-broadcastable sizes %r" % sizes
            raise ValueError(msg)

        # Expand broadcast dim and keep non broadcast dims the same size
        expand_sizes = (N,) + (-1,) * len(c.shape[1:])
        args_Nd.append(c.expand(*expand_sizes))

    return args_Nd



[docs]
def camera_position_from_spherical_angles(
    distance: float,
    elevation: float,
    azimuth: float,
    degrees: bool = True,
    device: Device = "cpu",
) -> torch.Tensor:
    """
    Calculate the location of the camera based on the distance away from
    the target point, the elevation and azimuth angles.

    Args:
        distance: distance of the camera from the object.
        elevation, azimuth: angles.
            The inputs distance, elevation and azimuth can be one of the following
                - Python scalar
                - Torch scalar
                - Torch tensor of shape (N) or (1)
        degrees: bool, whether the angles are specified in degrees or radians.
        device: str or torch.device, device for new tensors to be placed on.

    The vectors are broadcast against each other so they all have shape (N, 1).

    Returns:
        camera_position: (N, 3) xyz location of the camera.
    """
    dist, elev, azim = convert_to_tensors_and_broadcast(distance, elevation, azimuth, device=device)
    if degrees:
        elev = math.pi / 180.0 * elev
        azim = math.pi / 180.0 * azim
    x = dist * torch.cos(elev) * torch.sin(azim)
    y = dist * torch.sin(elev)
    z = dist * torch.cos(elev) * torch.cos(azim)
    camera_position = torch.stack([x, y, z], dim=1)
    if camera_position.dim() == 0:
        camera_position = camera_position.view(1, -1)  # add batch dim.
    return camera_position.view(-1, 3)



__all__ = ["coord_conversion", "compose_intr_mat", "look_at_rotation", "camera_position_from_spherical_angles"]