TartanAir V2 is a flexible dataset when used with this Python package. Using it, you can download, iterate, and modify the raw data. Here are some examples of what you can do with it.
import tartanair as ta

# Initialize TartanAir.
tartanair_data_root = '/my/path/to/root/folder/for/tartanair-v2'
ta.init(tartanair_data_root)

# Download a trajectory.
ta.download(env = "ArchVizTinyHouseDay",
            difficulty = ['easy'], # this can be 'easy', and/or 'hard'
            modality = ['image', 'depth', 'seg', 'imu'], # available modalities are: 'image', 'depth', 'seg', 'imu', 'lidar', 'flow', 'pose'
            camera_name = ['lcam_front', 'lcam_left', 'lcam_right', 'lcam_back', 'lcam_top', 'lcam_bottom'],
            unzip = True) # unzip files automatically after download
Alternatively, you can specify the download parameters in a YAML config file and pass it to the download function:

ta.download(config = 'download_config.yaml')
The config file is of the following format:
env: ['ArchVizTinyHouseDay']
difficulty: ['easy']
modality: ['image', 'depth']
camera_name: ['lcam_front', 'lcam_left', 'lcam_right', 'lcam_back', 'lcam_top', 'lcam_bottom']
unzip: True
TartanAir V2 allows you to synthesize your own dataset by modifying the raw data. For example, by specifying a new camera model and generating images using it.
import tartanair as ta

# For help with rotations.
from scipy.spatial.transform import Rotation

# Initialize TartanAir.
tartanair_data_root = '/my/path/to/root/folder/for/tartanair-v2'
ta.init(tartanair_data_root)

# Create your camera model(s).
R_raw_new0 = Rotation.from_euler('y', 90, degrees=True).as_matrix().tolist()

cam_model_0 = {'name': 'pinhole',
               'raw_side': 'left', # TartanAir has two cameras, one on the left and one on the right. This parameter specifies which camera to use.
               'params':
                   {'fx': 320,
                    'fy': 320,
                    'cx': 320,
                    'cy': 320,
                    'width': 640,
                    'height': 640},
               'R_raw_new': R_raw_new0}

R_raw_new1 = Rotation.from_euler('xyz', [45, 0, 0], degrees=True).as_matrix().tolist()

cam_model_1 = {'name': 'doublesphere',
               'raw_side': 'left',
               'params':
                   {'fx': 300,
                    'fy': 300,
                    'cx': 500,
                    'cy': 500,
                    'width': 1000,
                    'height': 1000,
                    'alpha': 0.6,
                    'xi': -0.2,
                    'fov_degree': 195},
               'R_raw_new': R_raw_new1}

# Customize the dataset.
ta.customize(env = 'ArchVizTinyHouseDay',
             difficulty = 'easy',
             trajectory_id = ['P000'],
             modality = ['image', 'depth'],
             new_camera_models_params = [cam_model_1, cam_model_0],
             num_workers = 4,
             device = "cuda") # or cpu
TartanAir V2 includes a powerful parallelized dataloader. It can load data from the dataset and serve mini-batches in parallel, and it can also apply (some) transformations to the data on the fly. We highly recommend using it for efficient data loading.
import tartanair as ta
import numpy as np
import cv2

# Initialize TartanAir.
tartanair_data_root = '/my/path/to/root/folder/for/tartanair-v2'
ta.init(tartanair_data_root)

# Specify the environments, difficulties, and trajectory ids to load.
envs = ['ArchVizTinyHouseDay']
difficulties = ['easy']
trajectory_ids = ['P000', 'P001']

# Specify the modalities to load.
modalities = ['image', 'pose', 'imu']
camnames = ['lcam_front', 'lcam_left', 'lcam_right', 'lcam_back', 'lcam_top', 'lcam_bottom']

# Specify the dataloader parameters.
new_image_shape_hw = [640, 640] # If None, no resizing is performed. If a value is passed, then the image is resized to this shape.
subset_framenum = 200 # This is the number of frames in a subset. Notice that this is an upper bound on the batch size. Ideally, make this number large to utilize your RAM efficiently. Information about the allocated memory will be provided in the console.
seq_length = {'image': 2, 'pose': 2, 'imu': 10} # This is the length of the data-sequences. For example, if the sequence length is 2, then the dataloader will load pairs of images.
seq_stride = 1 # This is the stride between the data-sequences. For example, if the sequence length is 2 and the stride is 1, then the dataloader will load pairs of images [0,1], [1,2], [2,3], etc. If the stride is 2, then the dataloader will load pairs of images [0,1], [2,3], [4,5], etc.
frame_skip = 0 # This is the number of frames to skip between each frame. For example, if the frame skip is 2 and the sequence length is 3, then the dataloader will load frames [0, 3, 6], [1, 4, 7], [2, 5, 8], etc.
batch_size = 8 # This is the number of data-sequences in a mini-batch.
num_workers = 4 # This is the number of workers to use for loading the data.
shuffle = True # Whether to shuffle the data. Set this to False if you want the visualization below to play the frames in order.

# Create a dataloader object.
dataloader = ta.dataloader(env = envs,
                           difficulty = difficulties,
                           trajectory_id = trajectory_ids,
                           modality = modalities,
                           camera_name = camnames,
                           new_image_shape_hw = new_image_shape_hw,
                           seq_length = seq_length,
                           subset_framenum = subset_framenum,
                           seq_stride = seq_stride,
                           frame_skip = frame_skip,
                           batch_size = batch_size,
                           num_workers = num_workers,
                           shuffle = shuffle,
                           verbose = True)

# Iterate over the batches.
for i in range(100):

    # Get the next batch.
    batch = dataloader.load_sample()

    # The shape of an image batch is (B, S, C, H, W), where B is the batch size, S is the sequence length, C is the number of channels, H is the height, and W is the width.
    print("Batch number: {}".format(i+1), "Loaded {} samples so far.".format((i+1) * batch_size))

    for b in range(batch_size):

        # Create an image cross for visualization.
        left = batch['image_lcam_left'][b][0].numpy().transpose(1,2,0)
        front = batch['image_lcam_front'][b][0].numpy().transpose(1,2,0)
        right = batch['image_lcam_right'][b][0].numpy().transpose(1,2,0)
        back = batch['image_lcam_back'][b][0].numpy().transpose(1,2,0)
        top = batch['image_lcam_top'][b][0].numpy().transpose(1,2,0)
        bottom = batch['image_lcam_bottom'][b][0].numpy().transpose(1,2,0)
        cross_mid = np.concatenate([left, front, right, back], axis=1)
        cross_top = np.concatenate([np.zeros_like(top), top, np.zeros_like(top), np.zeros_like(top)], axis=1)
        cross_bottom = np.concatenate([np.zeros_like(bottom), bottom, np.zeros_like(bottom), np.zeros_like(bottom)], axis=1)
        cross = np.concatenate([cross_top, cross_mid, cross_bottom], axis=0)

        pose = batch['pose_lcam_front'].numpy()
        imu = batch['imu'].numpy()

        # Resize.
        cross = cv2.resize(cross, (cross.shape[1]//4, cross.shape[0]//4))

        # Show the image cross.
        cv2.imshow('cross', cross)
        cv2.waitKey(100)

        print("  Pose: ", pose[0][0])
        print("  IMU: ", imu[0][0])

dataloader.stop_cachers()
Create a data iterator to get samples from the TartanAir V2 dataset. The samples include data in the specified modalities.
import tartanair as ta

# Initialize TartanAir.
tartanair_data_root = '/my/path/to/root/folder/for/tartanair-v2'
ta.init(tartanair_data_root)

# Create iterator.
ta_iterator = ta.iterator(env = ['ArchVizTinyHouseDay'],
                          difficulty = 'easy',
                          trajectory_id = [],
                          modality = 'image',
                          camera_name = ['lcam_left'])

for i in range(100):
    sample = next(ta_iterator)
TartanAir also provides tools for evaluating estimated trajectories against the ground truth. The evaluation is based on the ATE and RPE metrics, which can be computed for the entire trajectory, for a subset of the trajectory, and, if requested, for a scaled and aligned version of the estimated trajectory that better matches the ground truth.
import tartanair as ta
import numpy as np

# Initialize TartanAir.
tartanair_data_root = '/my/path/to/root/folder/for/tartanair-v2'
ta.init(tartanair_data_root)

# Create an example trajectory. This is a noisy version of the ground truth trajectory.
env = 'ArchVizTinyHouseDay'
difficulty = 'easy'
trajectory_id = 'P002'
camera_name = 'lcam_front'
gt_traj = ta.get_traj_np(env, difficulty, trajectory_id, camera_name)
est_traj = gt_traj + np.random.normal(0, 0.1, gt_traj.shape)

# Output path for the evaluation plot.
plot_out_path = "evaluator_example_plot.png"

# Pass the ground truth trajectory directly to the evaluation function.
results = ta.evaluate_traj(est_traj,
                           gt_traj = gt_traj,
                           enforce_length = True,
                           plot = True,
                           plot_out_path = plot_out_path,
                           do_scale = True,
                           do_align = True)

# Or pass the environment, difficulty, and trajectory id to the evaluation function.
results = ta.evaluate_traj(est_traj,
                           env = env,
                           difficulty = difficulty,
                           trajectory_id = trajectory_id,
                           camera_name = camera_name,
                           enforce_length = True,
                           plot = True,
                           plot_out_path = plot_out_path,
                           do_scale = True,
                           do_align = True)
TartanAir V2 also includes a powerful dense correspondence sampling tool that can be used to compute dense correspondences between any two viewpoints in the same environment.
The tool supports sampling dense correspondences between any combination of pinhole, fisheye (double sphere), and equirectangular cameras. Yes, you can sample dense correspondences between different camera models, such as pinhole and equirectangular.
Given a pair of RGB and depth image cubes and two view directions, the tool computes the dense correspondence, represented as optical flow, together with an occlusion mask indicating whether each pixel is directly observable in the other view.
Please refer to flow_sampling_example.py for a complete example.
To learn more about how the resampling happens, see Flow Sampling.
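Purely as an illustration of the underlying geometry (this is not the package API; the camera conventions and parameters below are assumed for the example, and the real tool lives in flow_sampling_example.py), the following sketch maps a pixel of an equirectangular image to a unit ray and reprojects it into a pinhole view that shares the same optical center:

import numpy as np

def equirect_pixel_to_ray(u, v, width, height):
    # Longitude/latitude parameterization of the equirectangular image (assumed convention).
    lon = (u / width) * 2.0 * np.pi - np.pi        # [-pi, pi)
    lat = np.pi / 2.0 - (v / height) * np.pi       # [pi/2, -pi/2]
    # Unit ray in a z-forward, x-right, y-down camera frame.
    x = np.cos(lat) * np.sin(lon)
    y = -np.sin(lat)
    z = np.cos(lat) * np.cos(lon)
    return np.array([x, y, z])

def pinhole_project(ray, fx, fy, cx, cy):
    # Project a ray onto the pinhole image plane (only valid in front of the camera).
    if ray[2] <= 0:
        return None
    u = fx * ray[0] / ray[2] + cx
    v = fy * ray[1] / ray[2] + cy
    return u, v

# Example: equirectangular pixel -> pinhole pixel (illustrative parameters only).
ray = equirect_pixel_to_ray(u=900, v=450, width=2000, height=1000)
print(pinhole_project(ray, fx=320, fy=320, cx=320, cy=320))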
Given a pair of images, computing the dense correspondence means finding, for each pixel location in the first image, the corresponding pixel location in the second image, together with a dense confidence, i.e. the probability that the match is valid rather than occluded. This is a fundamental problem in computer vision with many applications, such as optical flow, stereo matching, and image registration.
Based on the need for accurate, pixel-level correspondence information for the Match-Anything project, we chose to leverage existing datasets that contain accurate camera intrinsics/extrinsics and depth maps.
However, the real difficulty comes from how depth is stored: as depth maps. We therefore only have depth information at discrete pixel locations, which does not capture the full scene geometry. For example, when reprojecting a point from the first image onto the second image, it will almost always land on a non-integer coordinate where no depth value is stored. In other words, we only have 3D points at the integer pixel coordinates of both images, and it is unlikely that two such points will coincide in 3D space.
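To make the fractional-coordinate issue concrete, here is a minimal numpy sketch under an assumed pinhole model with illustrative intrinsics, relative pose, and depth value (none of these are TartanAir calibration values):

import numpy as np

# Illustrative pinhole intrinsics shared by both cameras.
K = np.array([[320.0,   0.0, 320.0],
              [  0.0, 320.0, 320.0],
              [  0.0,   0.0,   1.0]])

# Assumed relative pose of camera 1 expressed in camera 2 (small rotation about y plus a baseline).
R = np.array([[ 0.9998, 0.0, 0.02  ],
              [ 0.0,    1.0, 0.0   ],
              [-0.02,   0.0, 0.9998]])
t = np.array([0.25, 0.0, 0.0])

# A pixel in image 1 and its stored depth value (assuming the depth map stores z-depth).
u1, v1 = 100, 240
d1 = 3.7

# Back-project to 3D in camera 1, transform into camera 2, and reproject.
x_c1 = np.linalg.inv(K) @ np.array([u1, v1, 1.0]) * d1
x_c2 = R @ x_c1 + t
i2 = (K @ (x_c2 / x_c2[2]))[:2]

print(i2)  # approximately [130.6, 241.1]: a fractional coordinate with no stored depth sample in D2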
Given the following information from a pair of depth images:
Camera extrinsics \(\mathcal{T} \in SE(3)\)
Camera projection functions \(\pi\) and \(\hat{\pi}^{-1}\), in which \(\pi(x)\) projects a 3D point in the camera’s coordinate system to an image coordinate, and \(\hat{\pi}^{-1}(i)\) maps an image coordinate to the (unit-norm) ray in 3D that projects onto this image location
Discrete depth image \(\tilde{D} \in \mathbb{R}^{W \times H}\)
Find:
(easy) Find the dense warp \(W^{1 \to 2}: \mathcal{I}_1 \to \mathbb{R}^2\), so that 3D points that are perceived at location \(i_1\) in the discrete image \(\tilde{D}_1\) will be perceived at \(W^{1 \to 2}(i_1)\) in the continuous depth image \(D_2\).
(hard) Find the dense confidence (occlusion) \(\rho^{1 \to 2}: \mathcal{I}_1 \to [0, 1]\), the probability that the match is valid.
The dense warp is easy, since TartanAir is a synthetic dataset and all geometric information is accurate. The hard part is determining whether a pixel from the first image is directly observed in the second image, or occluded by another object even though it projects to the expected position.
Existing methods use depth-error thresholding to determine the confidence, as follows.
For each point \(i_1 \in \mathcal{I}_1\):
Project into 3D space with \(x^{\{C_1\}}_{i_1} = \hat{\pi}_1^{-1}(i_1) \cdot \tilde{D}_1[i_1]\)
Transform to camera 2’s coordinate frame with \(x^{\{C_2\}}_{i_1} = \mathcal{T}_{\{C_1\}}^{\{C_2\}} \cdot x^{\{C_1\}}_{i_1}\)
Compute the projected image coordinate \(i_2\) and the expected depth \(d\)
Get the depth value at \(i_2\) from \(\tilde{D}_2\), then compute and threshold the error:
\[e = |d - \tilde{D}_2[i_2]| , \quad \rho^{1 \to 2}[i_1] = P(e < \epsilon), \quad W^{1 \to 2}(i_1) = i_2\]
The problem is that we only have a discrete sample of \(D_2\), namely \(\tilde{D}_2\); we cannot read a depth value at the fractional pixel location \(i_2\) from the discrete \(\tilde{D}_2\).
Therefore, we need to interpolate the depth map to obtain an approximate expected depth. In this step the approximation may be inaccurate and can lead to aliasing artifacts under large FOV changes.
\[\tilde{D}_2[i_2] \approx \text{interp}(\tilde{D}_2, i_2), \quad \text{e.g.} \quad \tilde{D}_2[i_2] \approx \tilde{D}_2[\text{nearest}(i_2)]\]
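For illustration only (a toy depth map, not package code), the following sketch compares nearest and bilinear sampling of a depth map at a fractional coordinate near a depth discontinuity:

import numpy as np

def nearest_depth(depth, u, v):
    # Snap to the closest stored sample.
    return depth[int(round(v)), int(round(u))]

def bilinear_depth(depth, u, v):
    # Interpolate between the four surrounding samples.
    u0, v0 = int(np.floor(u)), int(np.floor(v))
    du, dv = u - u0, v - v0
    d00, d01 = depth[v0, u0], depth[v0, u0 + 1]
    d10, d11 = depth[v0 + 1, u0], depth[v0 + 1, u0 + 1]
    return (d00 * (1 - du) * (1 - dv) + d01 * du * (1 - dv)
            + d10 * (1 - du) * dv + d11 * du * dv)

# Toy depth map with a discontinuity (an object edge at column 2).
depth = np.array([[2.0, 2.0, 8.0, 8.0],
                  [2.0, 2.0, 8.0, 8.0]], dtype=np.float32)

u2, v2 = 1.6, 0.5   # fractional reprojected coordinate
print(nearest_depth(depth, u2, v2))   # 8.0: snaps across the edge
print(bilinear_depth(depth, u2, v2))  # 5.6: a blended value that exists nowhere in the scene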
The rightmost image shows the occlusion map computed with the above method. The aliasing artifacts appear as the black dots at the far side of the corridor.
With large FOV changes, walls that are perpendicular to the view in one image may become highly inclined in the other.
As shown in the figure, points projected from C1’s camera model and depth map land in only a few pixels of C2’s image. The nearest-neighbor approximation then leads to a large error in the depth value, as shown by the difference between the red and black lines.
We propose a fix to the above formulation by interpolating more carefully.
Use bilinear interpolation to get the depth value at the fractional pixel location \(i_2\) from \(\tilde{D}_2\):
\[\tilde{D}_2[i_2] = \text{bilinear}(\tilde{D}_2, i_2)\]
In most cases the depth value can be treated as continuous. The usual reason not to interpolate depth images is that depth can change rapidly, and interpolation creates non-existent depth values at object edges. However, we only use depth for verification, so its effect does not propagate beyond the occlusion calculation, and it is highly unlikely that a non-existent depth value will coincide with the reprojected depth, since we use a very small threshold; the risk is therefore minimal.
We also allow a small error in pixel space:
\[\rho^{1 \to 2}[i_1] = P\left( \min_{i \in B_{r_0}(i_2)} |d - \text{bilinear}(\tilde{D}_2, i)| < \epsilon \right)\]
In other words, we threshold the lower bound of the reprojection error in a small neighborhood of the projected pixel location. This helps compensate for homography effects in non-pinhole cameras and further reduces aliasing artifacts.
In conclusion, the full method is:
With typical parameters (a sketch of how they might combine follows the list below):
\(R_0 = 0.1\): Maximum search radius in pixels
\(n = 1\): Maximum number of iterations
\(\alpha = 0.2\): Step size for gradient descent
\(t_a = 0.04\): Absolute depth error threshold
\(t_r = 0.005\): Relative depth error threshold
\(\tau = 0.02\): Temperature for error confidence
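Below is a minimal sketch of how these parameters might combine. Two assumptions are made for brevity: the gradient-based refinement is replaced by a tiny grid search within radius \(R_0\), and the error-to-confidence mapping is taken to be a sigmoid with temperature \(\tau\); the package's actual implementation may differ.

import numpy as np

def bilinear(depth, u, v):
    u0, v0 = int(np.floor(u)), int(np.floor(v))
    du, dv = u - u0, v - v0
    return (depth[v0, u0] * (1 - du) * (1 - dv) + depth[v0, u0 + 1] * du * (1 - dv)
            + depth[v0 + 1, u0] * (1 - du) * dv + depth[v0 + 1, u0 + 1] * du * dv)

def match_confidence(depth2, i2, d_expected, R0=0.1, t_a=0.04, t_r=0.005, tau=0.02):
    # Confidence that the point reprojected to i2 with expected depth d_expected is visible in view 2.
    u2, v2 = i2
    # Search a small neighborhood of radius R0 around the projected pixel
    # (a tiny grid search stands in for the gradient-based refinement).
    offsets = np.linspace(-R0, R0, 3)
    errors = [abs(d_expected - bilinear(depth2, u2 + du, v2 + dv))
              for du in offsets for dv in offsets]
    e = min(errors)
    # Combined absolute + relative threshold, soft-thresholded with temperature tau (assumed form).
    eps = t_a + t_r * d_expected
    return 1.0 / (1.0 + np.exp((e - eps) / tau))   # close to 1 if e << eps, close to 0 if e >> eps

# Toy usage with illustrative values.
depth2 = np.full((480, 640), 4.0, dtype=np.float32)
print(match_confidence(depth2, i2=(130.6, 241.1), d_expected=4.01))  # high confidence
print(match_confidence(depth2, i2=(130.6, 241.1), d_expected=2.0))   # near zero: likely occluded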
How necessary is the above method? We ablate:
Nearest interpolation: We use nearest interpolation instead of bilinear interpolation.
Left - with bilinear interpolation. Mid - with nearest interpolation. Right - difference between the occlusion masks.
In this extreme example, we see that bilinear interpolation avoids major aliasing artifacts.
No optimization: We do not optimize for the lower bound of the reprojection error, and instead threshold the error directly.
Left - with optimization. Mid - without optimization. Right - difference between the occlusion masks.
Without the optimization, some occlusions are missed in highly distorted regions.
Install:

pip install tartanair
Welcome to TartanAir V2!
Let’s go on an adventure to beautiful mountains, to dark caves, to stylish homes, to the Moon 🚀, and to other exciting places. And there is more! You, your models, and your robots can experience these worlds via a variety of sensors: LiDAR, IMU, optical cameras with any lens configuration you want (we provide customizable fisheye, pinhole, and equirectangular camera models), depth cameras, segmentation “cameras”, and event cameras.
All the environments have recorded trajectories that were designed to be challenging and realistic. Can we improve the state of the art in SLAM, navigation, and robotics?
Similar to TartanAir V1, the TartanAir V2 dataset is licensed under a Creative Commons Attribution 4.0 International License and the toolkit is licensed under a BSD-3-Clause License.
To install the TartanAir Python package:

pip install tartanair

TartanAir depends on PyTorch and Torchvision for some customization functionalities. Please install them on your system yourself (we do not install them for you, to avoid version conflicts), following the instructions on the PyTorch website: https://pytorch.org/get-started/locally/ .
That’s it! You’re ready to go.
Currently the TartanAir Python package has only been tested on Ubuntu 20.04. We will be testing on other operating systems in the future.
Downloading does not work. It could be that you are missing wget. Install it using the following command:

sudo apt install wget

PyYAML fails to install with pip on macOS. Please install it manually using the following command:

python3 -m pip install pyyaml

opencv-contrib-python fails to install with pip on macOS. Please install it manually using the following command:

python3 -m pip install opencv-contrib-python

This might take a while.

pytransform3d fails to install with pip on macOS. Please install it manually using the following command:

python3 -m pip install pytransform3d

Pillow can cause trouble with older versions of torchvision. If you are facing issues with Pillow, like ImportError: cannot import name ‘PILLOW_VERSION’ from ‘PIL’, please reinstall Pillow manually, then remove and reinstall torchvision:

python3 -m pip install Pillow
python3 -m pip uninstall torchvision
python3 -m pip install torchvision

wget is not installed by default on macOS. Please install it manually using the following command:

brew install wget

URLLib may not find your certificates on macOS, and you will see something like:

urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1123)>

Please run this in a terminal to fix it (adapt the path to your Python version):

/Applications/Python\ 3.8/Install\ Certificates.command
We placed 12 cameras in the environment to collect raw data, including RGB images, depth images, semantic segmentation images, and camera poses. The 12 cameras all produce pinhole images with 90° fields of view. They are arranged as 2 stereo sets of 6 cameras pointing in 6 directions to cover 360° views. The raw data are processed to generate other modalities, including optical flow, fisheye images, LiDAR, and IMU.
For each time step, we provide 12 RGB images from the 12 cameras, covering a stereo pair of 360° views. Each image is sampled with an FoV of 90° and a resolution of 640 by 640. The stereo baseline is 0.25 m. In addition, we provide tools for adding random noise and motion blur to the images, to improve realism.
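The package ships its own augmentation tools; purely as a generic illustration of the kind of augmentation described (additive Gaussian noise plus a linear motion-blur kernel, using OpenCV rather than the TartanAir API):

import numpy as np
import cv2

def add_noise_and_motion_blur(image, noise_std=5.0, blur_len=9):
    # Additive Gaussian noise, clipped back to the valid 8-bit range.
    noisy = image.astype(np.float32) + np.random.normal(0.0, noise_std, image.shape)
    noisy = np.clip(noisy, 0, 255).astype(np.uint8)
    # Horizontal motion-blur kernel of length blur_len.
    kernel = np.zeros((blur_len, blur_len), dtype=np.float32)
    kernel[blur_len // 2, :] = 1.0 / blur_len
    return cv2.filter2D(noisy, -1, kernel)

# Example on a synthetic gray image of the dataset's resolution.
img = np.full((640, 640, 3), 128, dtype=np.uint8)
out = add_noise_and_motion_blur(img)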
The depth images are sampled with the same camera intrinsics and extrinsics as the RGB images, and are perfectly aligned and synchronized with them. Each pixel of the depth image is represented as a float32 number. Depending on the environment setup, far pixels pointing to the sky usually return a very large value.
We provide category-level semantic labels. We overcome a disadvantage of AirSim, which assigns random semantic labels to each model type in the environment, by manually labeling the model types for all the environments, so that each label in the semantic segmentation images maps to a semantic class. With 65 highly distinct environments, our data covers a wide range of semantic classes. However, due to the consecutive format of the data, large objects such as buildings, the ground, and the sky take up a much higher percentage of pixels. We provide statistics files for each environment, with which users can easily balance the data while training their semantic models.
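As an example of how such statistics can be used for balancing (the statistics file format is not reproduced here, so the per-class pixel counts below are hypothetical), inverse-frequency class weights can be computed as follows:

import numpy as np

# Hypothetical per-class pixel counts, as might be read from an environment's statistics file.
class_pixel_counts = {
    "sky": 4.1e9,
    "building": 2.7e9,
    "ground": 3.3e9,
    "chair": 1.2e7,
    "lamp": 3.5e6,
}

counts = np.array(list(class_pixel_counts.values()), dtype=np.float64)
# Inverse-frequency weights, normalized so the mean weight is 1.
weights = counts.sum() / (len(counts) * counts)
for name, w in zip(class_pixel_counts, weights):
    print(f"{name}: {w:.3f}")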
Camera poses are in the same format as in V1. Each line of the pose file consists of 3 numbers for the translation and 4 numbers for the orientation in quaternion format, expressed in the NED frame.
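A short sketch of reading such a pose file with numpy and scipy, assuming whitespace-separated values ordered tx ty tz qx qy qz qw as in TartanAir V1 (verify the ordering and the file name against your download):

import numpy as np
from scipy.spatial.transform import Rotation

# Load a pose file: one line per frame, 3 translation values followed by a quaternion.
poses = np.loadtxt('pose_lcam_front.txt')        # shape (N, 7), file name assumed

translations = poses[:, :3]                      # NED translation, meters
rotations = Rotation.from_quat(poses[:, 3:7])    # scipy expects scalar-last (qx, qy, qz, qw)

# Example: translation and rotation matrix of the first frame.
print(translations[0], rotations[0].as_matrix())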
The LiDAR data is sampled from the raw depth images of the 6 left-side cameras, following the pattern of the Velodyne Puck (VLP-16). As a result, the LiDAR frame is perfectly aligned with the left camera frame. We did not use the AirSim LiDAR sensor because it is based on the collision model, which misses many objects that do not have a collision model, such as branches and leaves. While sampling from the depth, we carefully balance accuracy and realism: we use linear interpolation except for points along object edges, to avoid ghost points there. We provide the processing script as well as other LiDAR models, such as the Velodyne Ultra Puck (VLP-32C), allowing users to create their own LiDAR data.
The fisheye and panorama data are sampled from the raw pinhole data, and thus contain all three modalities: RGB, depth, and semantics. One of the biggest challenges for the fisheye model is that real-world fisheye cameras have diverse FoVs and distortions. We have done two things to address this potential generalization issue. First, we define a standard model, called the Linear Spherical model, for fisheye images; to test real-world fisheye data with different intrinsics and distortion on a model trained on the TartanAir V2 dataset, one only needs to convert the real-world data into the Linear Spherical model. Second, we open-source our sampling code, together with a rich set of fisheye and pinhole camera models, which allows users to sample their own fisheye images (detailed in the customization section).
As in V1, the optical flow is calculated for the static environments by image warping, using the camera poses and depth images. The biggest upgrades are that we accelerate the code with a CUDA implementation and provide tools for generating optical flow across any pair of camera models (e.g. between pinhole and fisheye).
The IMU ground-truth data is generated by interpolating the camera poses; as a result, the IMU frame is perfectly aligned and synchronized with the left camera data. Specifically, we double-differentiate the translation using a spline to obtain the acceleration and differentiate the orientation using a spline to obtain the angular rate. We provide the code for customizing the data generation (such as changing the frequency) as well as code to add realistic noise.
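A rough sketch of the underlying idea (not the package's generator; the trajectory below is synthetic, and the NED/body-frame conventions and noise model are omitted): fit splines to the ground-truth positions and orientations and differentiate them.

import numpy as np
from scipy.interpolate import CubicSpline
from scipy.spatial.transform import Rotation, RotationSpline

# Ground-truth poses at the image rate (illustrative values).
t = np.linspace(0.0, 10.0, 101)                       # seconds
positions = np.stack([np.sin(t), np.cos(t), 0.1 * t], axis=1)
orientations = Rotation.from_euler('z', 0.3 * t)      # a simple yaw ramp

# Acceleration: second derivative of a cubic spline through the positions.
pos_spline = CubicSpline(t, positions)
t_imu = np.linspace(0.0, 10.0, 1001)                  # e.g. a 100 Hz IMU rate
accel_world = pos_spline(t_imu, 2)                    # world-frame linear acceleration

# Angular rate: first derivative of a rotation spline through the orientations.
rot_spline = RotationSpline(t, orientations)
angular_rate = rot_spline(t_imu, 1)                   # rad/s

print(accel_world.shape, angular_rate.shape)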
Following the same trajectories as the other modalities, we re-collect the front-facing camera data at 1000 Hz. We use ESIM [Rebecq18corl, Gehrig_2020_CVPR], as it is one of the fastest event camera simulators available and is close to state-of-the-art performance. Although some of the newer simulators, such as V2E and DVS Voltmeter, produce events that are closer to real cameras, models trained on ESIM data perform comparably to those trained with the state-of-the-art simulators [lin2022dvs]. We sample the 640×640 RGB images at 1000 Hz and then generate the events using the simulator. To improve generalization across various event cameras, we use a wide range of contrast thresholds between 0.2 and 1.0. Because the event data is collected separately from the other modalities, some frames are inconsistent with the other modalities in dynamic scenes.
Occupancy grid maps are built during data collection, as detailed in the data collection pipeline section. The resolution of the map ranges from 0.125 m to 0.5 m, depending on the size of the environment. The maps can be used to evaluate mapping algorithms.
We definitely have bugs. Please report them in GitHub issues or submit a pull request!