diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html
index e39f8042f..18c721f9d 100644
--- a/docs/source/_templates/layout.html
+++ b/docs/source/_templates/layout.html
@@ -36,7 +36,7 @@
}
diff --git a/docs/source/docker/advanced/code_examples/active_learning_worker_config.txt b/docs/source/docker/advanced/code_examples/active_learning_worker_config.txt
deleted file mode 100644
index 1f52ad826..000000000
--- a/docs/source/docker/advanced/code_examples/active_learning_worker_config.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-{
- enable_corruptness_check: true,
- remove_exact_duplicates: true,
- enable_training: false,
- pretagging: false,
- pretagging_debug: false,
- method: 'coral',
- stopping_condition: {
- n_samples: 0.1,
- min_distance: -1
- },
- scorer: 'object-frequency',
- scorer_config: {
- frequency_penalty: 0.25,
- min_score: 0.9
- },
- active_learning: {
- task_name: 'my-classification-task',
- score_name: 'uncertainty_margin'
- }
-}
\ No newline at end of file
diff --git a/docs/source/docker/advanced/code_examples/load_model_from_checkpoint.py b/docs/source/docker/advanced/code_examples/load_model_from_checkpoint.py
deleted file mode 100644
index 93c907cfc..000000000
--- a/docs/source/docker/advanced/code_examples/load_model_from_checkpoint.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from collections import OrderedDict
-
-import torch
-
-import lightly
-
-
-def load_ckpt(ckpt_path, model_name="resnet-18", model_width=1, map_location="cpu"):
- ckpt = torch.load(ckpt_path, map_location=map_location)
-
- state_dict = OrderedDict()
- for key, value in ckpt["state_dict"].items():
- if ("projection_head" in key) or ("backbone.7" in key):
- # drop layers used for projection head
- continue
- state_dict[key.replace("model.backbone.", "")] = value
-
- resnet = lightly.models.ResNetGenerator(name=model_name, width=model_width)
- model = torch.nn.Sequential(
- lightly.models.batchnorm.get_norm_layer(3, 0),
- *list(resnet.children())[:-1],
- torch.nn.AdaptiveAvgPool2d(1),
- torch.nn.Flatten(1),
- )
- try:
- model.load_state_dict(state_dict)
- except RuntimeError:
- raise RuntimeError(
- f"It looks like you tried loading a checkpoint from a model that is not a {model_name} with width={model_width}! "
- f"Please set model_name and model_width to the lightly.model.name and lightly.model.width parameters from the "
- f"configuration you used to run Lightly. The configuration from a Lightly worker run can be found in output_dir/config/config.yaml"
- )
- return model
-
-
-# loading the model
-model = load_ckpt("output_dir/lightly_epoch_X.ckpt")
-
-
-# example usage
-image_batch = torch.rand(16, 3, 224, 224)
-out = model(image_batch)
-print(out.shape) # prints: torch.Size([16, 512])
-
-
-# creating a classifier from the pre-trained model
-num_classes = 10
-classifier = torch.nn.Sequential(
- model, torch.nn.Linear(512, num_classes) # use 2048 instead of 512 for resnet-50
-)
-
-out = classifier(image_batch)
-print(out.shape) # prints: torch.Size([16, 10])
diff --git a/docs/source/docker/advanced/code_examples/object_level_worker_config.txt b/docs/source/docker/advanced/code_examples/object_level_worker_config.txt
deleted file mode 100644
index a4740f931..000000000
--- a/docs/source/docker/advanced/code_examples/object_level_worker_config.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-{
- object_level: {
- task_name: 'vehicles_object_detections'
- },
- enable_corruptness_check: true,
- remove_exact_duplicates: true,
- enable_training: false,
- pretagging: false,
- pretagging_debug: false,
- method: 'coreset',
- stopping_condition: {
- n_samples: 0.1,
- min_distance: -1
- },
- scorer: 'object-frequency',
- scorer_config: {
- frequency_penalty: 0.25,
- min_score: 0.9
- },
- active_learning: {
- task_name: '',
- score_name: 'uncertainty_margin'
- }
-}
\ No newline at end of file
diff --git a/docs/source/docker/advanced/code_examples/object_level_worker_config_pretagging.txt b/docs/source/docker/advanced/code_examples/object_level_worker_config_pretagging.txt
deleted file mode 100644
index 39ce861cf..000000000
--- a/docs/source/docker/advanced/code_examples/object_level_worker_config_pretagging.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-{
- object_level: {
- task_name: 'lightly_pretagging'
- },
- enable_corruptness_check: true,
- remove_exact_duplicates: true,
- enable_training: false,
- pretagging: true,
- pretagging_debug: false,
- method: 'coreset',
- stopping_condition: {
- n_samples: 0.1,
- min_distance: -1
- },
- scorer: 'object-frequency',
- scorer_config: {
- frequency_penalty: 0.25,
- min_score: 0.9
- },
- active_learning: {
- task_name: '',
- score_name: 'uncertainty_margin'
- }
-}
\ No newline at end of file
diff --git a/docs/source/docker/advanced/code_examples/python_create_dataset_azure_example.py b/docs/source/docker/advanced/code_examples/python_create_dataset_azure_example.py
deleted file mode 100644
index 4badd92a2..000000000
--- a/docs/source/docker/advanced/code_examples/python_create_dataset_azure_example.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import json
-
-import lightly
-from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
-from lightly.openapi_generated.swagger_client.models.datasource_purpose import (
- DatasourcePurpose,
-)
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="YOUR_TOKEN")
-
-# Create a new dataset on the Lightly Platform.
-client.create_dataset("pedestrian-videos-datapool", dataset_type=DatasetType.VIDEOS)
-
-# Azure Blob Storage
-# Input bucket
-client.set_azure_config(
- container_name="my-container/input/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_azure_config(
- container_name="my-container/output/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- purpose=DatasourcePurpose.LIGHTLY,
-)
diff --git a/docs/source/docker/advanced/code_examples/python_create_dataset_gcs_example.py b/docs/source/docker/advanced/code_examples/python_create_dataset_gcs_example.py
deleted file mode 100644
index f8d86cc56..000000000
--- a/docs/source/docker/advanced/code_examples/python_create_dataset_gcs_example.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import json
-
-import lightly
-from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
-from lightly.openapi_generated.swagger_client.models.datasource_purpose import (
- DatasourcePurpose,
-)
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="YOUR_TOKEN")
-
-# Create a new dataset on the Lightly Platform.
-client.create_dataset("pedestrian-videos-datapool", dataset_type=DatasetType.VIDEOS)
-
-# Google Cloud Storage
-# Input bucket
-client.set_gcs_config(
- resource_path="gs://bucket/input/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials_read.json"))),
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_gcs_config(
- resource_path="gs://bucket/output/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials_write.json"))),
- purpose=DatasourcePurpose.LIGHTLY,
-)
diff --git a/docs/source/docker/advanced/code_examples/python_create_dataset_s3_example.py b/docs/source/docker/advanced/code_examples/python_create_dataset_s3_example.py
deleted file mode 100644
index 8dea5a7b0..000000000
--- a/docs/source/docker/advanced/code_examples/python_create_dataset_s3_example.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import json
-
-import lightly
-from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
-from lightly.openapi_generated.swagger_client.models.datasource_purpose import (
- DatasourcePurpose,
-)
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="YOUR_TOKEN")
-
-# Create a new dataset on the Lightly Platform.
-client.create_dataset("pedestrian-videos-datapool", dataset_type=DatasetType.VIDEOS)
-
-# AWS S3
-# Input bucket
-client.set_s3_config(
- resource_path="s3://bucket/input/",
- region="eu-central-1",
- access_key="S3-ACCESS-KEY",
- secret_access_key="S3-SECRET-ACCESS-KEY",
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_s3_config(
- resource_path="s3://bucket/output/",
- region="eu-central-1",
- access_key="S3-ACCESS-KEY",
- secret_access_key="S3-SECRET-ACCESS-KEY",
- purpose=DatasourcePurpose.LIGHTLY,
-)
diff --git a/docs/source/docker/advanced/code_examples/python_create_frame_predictions.py b/docs/source/docker/advanced/code_examples/python_create_frame_predictions.py
deleted file mode 100644
index d71133616..000000000
--- a/docs/source/docker/advanced/code_examples/python_create_frame_predictions.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import json
-from pathlib import Path
-from typing import Dict, List
-
-import av
-
-dataset_dir = Path("/datasets/my_dataset")
-predictions_dir = dataset_dir / ".lightly" / "predictions" / "my_prediction_task"
-
-
-def model_predict(frame) -> List[Dict]:
- # This function must be overwritten to generate predictions for a frame using
- # a prediction model of your choice. Here we just return an example prediction.
- # See https://docs.lightly.ai/docker/advanced/datasource_predictions.html#prediction-format
- # for possible prediction formats.
- return [{"category_id": 0, "bbox": [0, 10, 100, 30], "score": 0.8}]
-
-
-for video_path in dataset_dir.glob("**/*.mp4"):
- # get predictions for frames
- predictions = []
- with av.open(str(video_path)) as container:
- stream = container.streams.video[0]
- for frame in container.decode(stream):
- predictions.append(model_predict(frame.to_image()))
-
- # save predictions
- num_frames = len(predictions)
- zero_padding = len(str(num_frames))
- for frame_index, frame_predictions in enumerate(predictions):
- video_name = video_path.relative_to(dataset_dir).with_suffix("")
- frame_name = Path(
- f"{video_name}-{frame_index:0{zero_padding}}-{video_path.suffix[1:]}.png"
- )
- prediction = {
- "file_name": str(frame_name),
- "predictions": frame_predictions,
- }
- out_path = predictions_dir / frame_name.with_suffix(".json")
- out_path.parent.mkdir(parents=True, exist_ok=True)
- with open(out_path, "w") as file:
- json.dump(prediction, file)
-
-
-# example directory structure before
-# .
-# ├── test
-# │ └── video_0.mp4
-# └── train
-# ├── video_1.mp4
-# └── video_2.mp4
-#
-# example directory structure after
-# .
-# ├── .lightly
-# │ └── predictions
-# │ └── my_prediction_task
-# │ ├── test
-# │ │ ├── video_0-000-mp4.json
-# │ │ ├── video_0-001-mp4.json
-# │ │ ├── video_0-002-mp4.json
-# │ │ └── ...
-# │ └── train
-# │ ├── video_1-000-mp4.json
-# │ ├── video_1-001-mp4.json
-# │ ├── video_1-002-mp4.json
-# | ├── ...
-# | ├── video_2-000-mp4.json
-# | ├── video_2-001-mp4.json
-# | ├── video_2-002-mp4.json
-# │ └── ...
-# ├── test
-# │ └── video_0.mp4
-# └── train
-# ├── video_1.mp4
-# └── video_2.mp4
diff --git a/docs/source/docker/advanced/code_examples/python_run_datapool_example.py b/docs/source/docker/advanced/code_examples/python_run_datapool_example.py
deleted file mode 100644
index 669c42ba3..000000000
--- a/docs/source/docker/advanced/code_examples/python_run_datapool_example.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import lightly
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="YOUR_TOKEN")
-
-# Let's fetch the dataset we created above, by name
-client.set_dataset_id_by_name("pedestrian-videos-datapool")
-
-# Schedule the compute run using our custom config.
-# We show here the full default config so you can easily edit the
-# values according to your needs.
-client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": False,
- "pretagging_debug": False,
- },
- selection_config={
- "n_samples": 100,
- "strategies": [
- {
- "input": {"type": "EMBEDDINGS"},
- "strategy": {
- "type": "DIVERSITY",
- "stopping_condition_minimum_distance": 0.1,
- },
- }
- ],
- },
- lightly_config={
- "loader": {
- "batch_size": 128,
- "shuffle": True,
- "num_workers": -1,
- "drop_last": True,
- },
- "model": {"name": "resnet-18", "out_dim": 128, "num_ftrs": 32, "width": 1},
- "trainer": {"gpus": 1, "max_epochs": 1, "precision": 16},
- "criterion": {"temperature": 0.5},
- "optimizer": {"lr": 1, "weight_decay": 0.00001},
- "collate": {
- "input_size": 64,
- "cj_prob": 0.8,
- "cj_bright": 0.7,
- "cj_contrast": 0.7,
- "cj_sat": 0.7,
- "cj_hue": 0.2,
- "min_scale": 0.15,
- "random_gray_scale": 0.2,
- "gaussian_blur": 0.0,
- "kernel_size": 0.1,
- "vf_prob": 0,
- "hf_prob": 0.5,
- "rr_prob": 0,
- },
- },
-)
diff --git a/docs/source/docker/advanced/code_examples/python_run_datapool_example_2.py b/docs/source/docker/advanced/code_examples/python_run_datapool_example_2.py
deleted file mode 100644
index d625a3e15..000000000
--- a/docs/source/docker/advanced/code_examples/python_run_datapool_example_2.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import lightly
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="YOUR_TOKEN")
-
-# Let's fetch the dataset we created above, by name
-client.set_dataset_id_by_name("pedestrian-videos-datapool")
-
-# Schedule the compute run using our custom config.
-# We show here the full default config so you can easily edit the
-# values according to your needs.
-client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": False,
- "pretagging_debug": False,
- },
- selection_config={
- "n_samples": 100,
- "strategies": [
- {
- "input": {"type": "EMBEDDINGS"},
- "strategy": {
- "type": "DIVERSITY",
- "stopping_condition_minimum_distance": 0.2,
- },
- }
- ],
- },
- lightly_config={
- "loader": {
- "batch_size": 128,
- "shuffle": True,
- "num_workers": -1,
- "drop_last": True,
- },
- "model": {"name": "resnet-18", "out_dim": 128, "num_ftrs": 32, "width": 1},
- "trainer": {"gpus": 1, "max_epochs": 1, "precision": 16},
- "criterion": {"temperature": 0.5},
- "optimizer": {"lr": 1, "weight_decay": 0.00001},
- "collate": {
- "input_size": 64,
- "cj_prob": 0.8,
- "cj_bright": 0.7,
- "cj_contrast": 0.7,
- "cj_sat": 0.7,
- "cj_hue": 0.2,
- "min_scale": 0.15,
- "random_gray_scale": 0.2,
- "gaussian_blur": 0.0,
- "kernel_size": 0.1,
- "vf_prob": 0,
- "hf_prob": 0.5,
- "rr_prob": 0,
- },
- },
-)
diff --git a/docs/source/docker/advanced/code_examples/python_run_object_level.py b/docs/source/docker/advanced/code_examples/python_run_object_level.py
deleted file mode 100644
index 85239ae2b..000000000
--- a/docs/source/docker/advanced/code_examples/python_run_object_level.py
+++ /dev/null
@@ -1,135 +0,0 @@
-import json
-
-import lightly
-from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
-from lightly.openapi_generated.swagger_client.models.datasource_purpose import (
- DatasourcePurpose,
-)
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="YOUR_TOKEN")
-
-# Create a new dataset on the Lightly Platform.
-client.create_dataset("dataset-name", dataset_type=DatasetType.IMAGES)
-
-# Pick one of the following three blocks depending on where your data is
-# AWS S3
-# Input bucket
-client.set_s3_config(
- resource_path="s3://bucket/input/",
- region="eu-central-1",
- access_key="S3-ACCESS-KEY",
- secret_access_key="S3-SECRET-ACCESS-KEY",
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_s3_config(
- resource_path="s3://bucket/output/",
- region="eu-central-1",
- access_key="S3-ACCESS-KEY",
- secret_access_key="S3-SECRET-ACCESS-KEY",
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-
-# or Google Cloud Storage
-# Input bucket
-client.set_gcs_config(
- resource_path="gs://bucket/input/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials_read.json"))),
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_gcs_config(
- resource_path="gs://bucket/output/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials_write.json"))),
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-
-# or Azure Blob Storage
-# Input bucket
-client.set_azure_config(
- container_name="my-container/input/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_azure_config(
- container_name="my-container/output/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-# Schedule the docker run with the "object_level.task_name" argument set.
-# All other settings are default values and we show them so you can easily edit
-# the values according to your need.
-client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": False,
- "pretagging_debug": False,
- "object_level": { # used for object level workflow
- "task_name": "vehicles_object_detections"
- },
- },
- selection_config={
- "n_samples": 100,
- "strategies": [
- {
- "input": {"type": "EMBEDDINGS"},
- "strategy": {
- "type": "DIVERSITY",
- },
- },
- # Optionally, you can combine diversity selection with active learning
- # to prefer selecting objects the model struggles with.
-            # If you want that, uncomment the following block:
-            # {
-            #     "input": {
-            #         "type": "SCORES",
-            #         "task": "vehicles_object_detections",  # change to your task
-            #         "score": "uncertainty_entropy",  # change to your preferred score
-            #     },
-            #     "strategy": {
-            #         "type": "WEIGHTS"
-            #     }
-            # },
- ],
- },
- lightly_config={
- "loader": {
- "batch_size": 16,
- "shuffle": True,
- "num_workers": -1,
- "drop_last": True,
- },
- "model": {"name": "resnet-18", "out_dim": 128, "num_ftrs": 32, "width": 1},
- "trainer": {"gpus": 1, "max_epochs": 100, "precision": 32},
- "criterion": {"temperature": 0.5},
- "optimizer": {"lr": 1, "weight_decay": 0.00001},
- "collate": {
- "input_size": 64,
- "cj_prob": 0.8,
- "cj_bright": 0.7,
- "cj_contrast": 0.7,
- "cj_sat": 0.7,
- "cj_hue": 0.2,
- "min_scale": 0.15,
- "random_gray_scale": 0.2,
- "gaussian_blur": 0.5,
- "kernel_size": 0.1,
- "vf_prob": 0,
- "hf_prob": 0.5,
- "rr_prob": 0,
- },
- },
-)
diff --git a/docs/source/docker/advanced/code_examples/python_run_object_level_pretagging.py b/docs/source/docker/advanced/code_examples/python_run_object_level_pretagging.py
deleted file mode 100644
index 8dd55de20..000000000
--- a/docs/source/docker/advanced/code_examples/python_run_object_level_pretagging.py
+++ /dev/null
@@ -1,120 +0,0 @@
-import json
-
-import lightly
-from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
-from lightly.openapi_generated.swagger_client.models.datasource_purpose import (
- DatasourcePurpose,
-)
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="YOUR_TOKEN")
-
-# Create a new dataset on the Lightly Platform.
-client.create_dataset("dataset-name", dataset_type=DatasetType.IMAGES)
-
-
-# Pick one of the following three blocks depending on where your data is
-# AWS S3
-# Input bucket
-client.set_s3_config(
- resource_path="s3://bucket/input/",
- region="eu-central-1",
- access_key="S3-ACCESS-KEY",
- secret_access_key="S3-SECRET-ACCESS-KEY",
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_s3_config(
- resource_path="s3://bucket/output/",
- region="eu-central-1",
- access_key="S3-ACCESS-KEY",
- secret_access_key="S3-SECRET-ACCESS-KEY",
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-
-# or Google Cloud Storage
-# Input bucket
-client.set_gcs_config(
- resource_path="gs://bucket/input/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials_read.json"))),
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_gcs_config(
- resource_path="gs://bucket/output/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials_write.json"))),
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-
-# or Azure Blob Storage
-# Input bucket
-client.set_azure_config(
- container_name="my-container/input/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_azure_config(
- container_name="my-container/output/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-# Schedule the docker run with the "object_level.task_name" argument set to
-# "lightly_pretagging" and with "pretagging" set to True.
-# All other settings are default values and we show them so you can easily edit
-# the values according to your need.
-client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": True,
- "pretagging_debug": False,
- "object_level": {"task_name": "lightly_pretagging"},
- },
- selection_config={
- "n_samples": 100,
- "strategies": [
- {
- "input": {"type": "EMBEDDINGS"},
- "strategy": {
- "type": "DIVERSITY",
- },
- }
- ],
- },
- lightly_config={
- "loader": {
- "batch_size": 16,
- "shuffle": True,
- "num_workers": -1,
- "drop_last": True,
- },
- "model": {"name": "resnet-18", "out_dim": 128, "num_ftrs": 32, "width": 1},
- "trainer": {"gpus": 1, "max_epochs": 100, "precision": 32},
- "criterion": {"temperature": 0.5},
- "optimizer": {"lr": 1, "weight_decay": 0.00001},
- "collate": {
- "input_size": 64,
- "cj_prob": 0.8,
- "cj_bright": 0.7,
- "cj_contrast": 0.7,
- "cj_sat": 0.7,
- "cj_hue": 0.2,
- "min_scale": 0.15,
- "random_gray_scale": 0.2,
- "gaussian_blur": 0.5,
- "kernel_size": 0.1,
- "vf_prob": 0,
- "hf_prob": 0.5,
- "rr_prob": 0,
- },
- },
-)
diff --git a/docs/source/docker/advanced/code_examples/python_run_pretagging.py b/docs/source/docker/advanced/code_examples/python_run_pretagging.py
deleted file mode 100644
index fe7dedd7b..000000000
--- a/docs/source/docker/advanced/code_examples/python_run_pretagging.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import json
-
-import lightly
-from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
-from lightly.openapi_generated.swagger_client.models.datasource_purpose import (
- DatasourcePurpose,
-)
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="YOUR_TOKEN")
-
-# Create a new dataset on the Lightly Platform. In this example we use pretagging
-# on images. We can also use videos instead by setting dataset_type=DatasetType.VIDEOS
-client.create_dataset("your-dataset-name", dataset_type=DatasetType.IMAGES)
-
-# Pick one of the following three blocks depending on where your data is
-# AWS S3
-# Input bucket
-client.set_s3_config(
- resource_path="s3://bucket/input/",
- region="eu-central-1",
- access_key="S3-ACCESS-KEY",
- secret_access_key="S3-SECRET-ACCESS-KEY",
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_s3_config(
- resource_path="s3://bucket/output/",
- region="eu-central-1",
- access_key="S3-ACCESS-KEY",
- secret_access_key="S3-SECRET-ACCESS-KEY",
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-
-# or Google Cloud Storage
-# Input bucket
-client.set_gcs_config(
- resource_path="gs://bucket/input/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials_read.json"))),
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_gcs_config(
- resource_path="gs://bucket/output/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials_write.json"))),
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-
-# or Azure Blob Storage
-# Input bucket
-client.set_azure_config(
- container_name="my-container/input/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_azure_config(
- container_name="my-container/output/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-# Schedule the compute run using our custom config.
-# We show here the full default config so you can easily edit the
-# values according to your needs.
-client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": True, # to enable pretagging
- "pretagging_debug": True, # we also want debugging images in the report
- },
- selection_config={
- "n_samples": 100,
- "strategies": [
- {
- "input": {"type": "EMBEDDINGS"},
- "strategy": {
- "type": "DIVERSITY",
- },
- }
- ],
- },
- lightly_config={
- "loader": {
- "batch_size": 128,
- "shuffle": True,
- "num_workers": -1,
- "drop_last": True,
- },
- "model": {"name": "resnet-18", "out_dim": 128, "num_ftrs": 32, "width": 1},
- "trainer": {"gpus": 1, "max_epochs": 1, "precision": 16},
- "criterion": {"temperature": 0.5},
- "optimizer": {"lr": 1, "weight_decay": 0.00001},
- "collate": {
- "input_size": 64,
- "cj_prob": 0.8,
- "cj_bright": 0.7,
- "cj_contrast": 0.7,
- "cj_sat": 0.7,
- "cj_hue": 0.2,
- "min_scale": 0.15,
- "random_gray_scale": 0.2,
- "gaussian_blur": 0.0,
- "kernel_size": 0.1,
- "vf_prob": 0,
- "hf_prob": 0.5,
- "rr_prob": 0,
- },
- },
-)
diff --git a/docs/source/docker/advanced/code_examples/python_run_sequence_selection.py b/docs/source/docker/advanced/code_examples/python_run_sequence_selection.py
deleted file mode 100644
index dfc405793..000000000
--- a/docs/source/docker/advanced/code_examples/python_run_sequence_selection.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import json
-
-import lightly
-from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
-from lightly.openapi_generated.swagger_client.models.datasource_purpose import (
- DatasourcePurpose,
-)
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="YOUR_TOKEN")
-
-# Create a new dataset on the Lightly Platform.
-client.create_dataset("pexels", dataset_type=DatasetType.VIDEOS)
-
-# Pick one of the following three blocks depending on where your data is
-# AWS S3
-# Input bucket
-client.set_s3_config(
- resource_path="s3://bucket/input/",
- region="eu-central-1",
- access_key="S3-ACCESS-KEY",
- secret_access_key="S3-SECRET-ACCESS-KEY",
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_s3_config(
- resource_path="s3://bucket/output/",
- region="eu-central-1",
- access_key="S3-ACCESS-KEY",
- secret_access_key="S3-SECRET-ACCESS-KEY",
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-
-# or Google Cloud Storage
-# Input bucket
-client.set_gcs_config(
- resource_path="gs://bucket/input/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials_read.json"))),
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_gcs_config(
- resource_path="gs://bucket/output/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials_write.json"))),
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-
-# or Azure Blob Storage
-# Input bucket
-client.set_azure_config(
- container_name="my-container/input/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- purpose=DatasourcePurpose.INPUT,
-)
-# Output bucket
-client.set_azure_config(
- container_name="my-container/output/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- purpose=DatasourcePurpose.LIGHTLY,
-)
-
-# Schedule the compute run using our custom config.
-# We show here the full default config so you can easily edit the
-# values according to your needs.
-client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": False,
- "remove_exact_duplicates": False,
- "enable_training": False,
- "pretagging": False,
- "pretagging_debug": False,
- "method": "coreset",
- "stopping_condition": {
- "n_samples": 200, # select 200 frames of length 10 frames -> 20 sequences
- "min_distance": -1,
- },
- "selected_sequence_length": 10, # we want sequences of 10 frames lenght
- },
- lightly_config={
- "loader": {
- "batch_size": 128,
- "shuffle": True,
- "num_workers": -1,
- "drop_last": True,
- },
- "model": {"name": "resnet-18", "out_dim": 128, "num_ftrs": 32, "width": 1},
- "trainer": {"gpus": 1, "max_epochs": 1, "precision": 16},
- "criterion": {"temperature": 0.5},
- "optimizer": {"lr": 1, "weight_decay": 0.00001},
- "collate": {
- "input_size": 64,
- "cj_prob": 0.8,
- "cj_bright": 0.7,
- "cj_contrast": 0.7,
- "cj_sat": 0.7,
- "cj_hue": 0.2,
- "min_scale": 0.15,
- "random_gray_scale": 0.2,
- "gaussian_blur": 0.0,
- "kernel_size": 0.1,
- "vf_prob": 0,
- "hf_prob": 0.5,
- "rr_prob": 0,
- },
- },
-)
diff --git a/docs/source/docker/advanced/code_examples/semantic_segmentation_inference.py b/docs/source/docker/advanced/code_examples/semantic_segmentation_inference.py
deleted file mode 100644
index 62a4f379f..000000000
--- a/docs/source/docker/advanced/code_examples/semantic_segmentation_inference.py
+++ /dev/null
@@ -1,121 +0,0 @@
-import json
-import os
-
-import numpy as np
-
-TASK_NAME = "lightly_semantic_segmentation"
-CATEGORIES = ["background", "car", "person"]
-
-
-def get_dummy_prediction(height: int = 500, width: int = 500):
- """Returns a dummy prediction of shape h x w x n_classes.
-
- Height and width are in pixels.
- """
- return np.random.rand(height, width, len(CATEGORIES))
-
-
-def filename_to_json(filename: str):
- """Turns an image filename into the respective json filename."""
- root, _ = os.path.splitext(filename)
- return f"{root}.json"
-
-
-def binary_to_rle(binary_mask: np.ndarray) -> np.ndarray:
- """Converts a binary segmentation mask to RLE."""
- # Flatten mask and add -1 at beginning and end of array
- flat = np.concatenate(([-1], np.ravel(binary_mask), [-1]))
- # Find indices where a change to 0 or 1 happens
- borders = np.nonzero(np.diff(flat))[0]
- # Find counts of subsequent 0s and 1s
- rle = np.diff(borders)
- if flat[1]:
- # The first value in the encoding must always be the count
- # of initial 0s. If the mask starts with a 1 we must set
- # this count to 0.
- rle = np.concatenate(([0], rle))
- return rle
-
-
-def convert_to_lightly_prediction(filename: str, seg_map: np.ndarray):
- """Converts a segmentation map of shape W x H x C to Lightly format."""
- seg_map_argmax = np.argmax(seg_map, axis=-1)
-
- prediction = {"file_name": filename, "predictions": []}
- for category_id in np.unique(seg_map_argmax):
- rle = binary_to_rle(seg_map_argmax == category_id)
- logits = np.mean(seg_map[seg_map_argmax == category_id], axis=0)
- assert np.argmax(logits) == category_id
- probabilities = np.exp(logits) / np.sum(np.exp(logits))
- assert abs(np.sum(probabilities) - 1.0) < 1e-6
-
- prediction["predictions"].append(
- {
- "category_id": int(category_id),
- "segmentation": [int(r) for r in rle],
- "score": float(probabilities[category_id]),
- "probabilities": [float(p) for p in probabilities],
- }
- )
-
- return prediction
-
-
-# The following code will generate a tasks.json, a schema.json, and a dummy
-# prediction file called my_image.json. To use them with the Lightly worker,
-# arrange them as follows in a .lightly directory
-#
-# .lightly/
-# L predictions/
-# L tasks.json
-# L lightly_semantic_segmentation/
-# L schema.json
-# L // add the real prediction files here
-#
-
-
-# add tasks.json
-tasks = [TASK_NAME]
-with open("tasks.json", "w") as f:
- json.dump(tasks, f)
-
-# add schema.json
-schema = {
- "task_type": "semantic-segmentation",
- "categories": [
- {
- "id": i,
- "name": name,
- }
- for i, name in enumerate(CATEGORIES)
- ],
-}
-with open("schema.json", "w") as f:
- json.dump(schema, f)
-
-# generate a dummy prediction
-filename = "my_image.png"
-prediction = get_dummy_prediction() # this is a h x w x n_classes numpy array
-category_ids = np.argmax(prediction, axis=-1)
-
-lightly_prediction = {"file_name": filename, "predictions": []}
-for category_id in np.unique(category_ids):
- # get the run-length encoding
- rle = binary_to_rle(category_ids == category_id)
- # get the logits
- logits = np.mean(prediction[category_ids == category_id], axis=0)
- assert np.argmax(logits) == category_id
- probabilities = np.exp(logits) / np.sum(np.exp(logits))
- assert abs(np.sum(probabilities) - 1.0) < 1e-6
-
- lightly_prediction["predictions"].append(
- {
- "category_id": int(category_id),
- "segmentation": [int(r) for r in rle],
- "score": float(probabilities[category_id]),
- "probabilities": [float(p) for p in probabilities],
- }
- )
-
-with open(filename_to_json(filename), "w") as f:
- json.dump(lightly_prediction, f)
diff --git a/docs/source/docker/advanced/datapool.rst b/docs/source/docker/advanced/datapool.rst
deleted file mode 100644
index 389af3336..000000000
--- a/docs/source/docker/advanced/datapool.rst
+++ /dev/null
@@ -1,139 +0,0 @@
-.. _datapool:
-
-Datapool
-=================
-
-The Lightly Worker is designed so that you can incrementally build up a
-dataset for your project. It automatically keeps track of the
-representations of previously selected samples and uses this information
-to pick new samples that maximize the quality of the final dataset.
-It also allows combining two different datasets into one.
-
-For example, let's imagine we have a dataset of street videos. After running
-the Lightly Worker once, we add 4 more street videos to the bucket.
-The new raw data might include valuable samples, so we want to add a subset
-of them to the dataset in the Lightly Platform.
-
-This workflow is supported by the Lightly Platform using a datapool.
-It remembers which raw data in your bucket has already been processed
-and will ignore it in future Lightly Worker runs.
-
-Thus you can run the Lightly Worker with the same command again. It will find
-your new raw data in the bucket, stream, embed and subsample it and then add it to
-your existing dataset. The selection strategy will take the existing data in your dataset
-into account when selecting new data to be added to your dataset.
-
-.. image:: ./images/webapp-embedding-after-2nd-docker.png
-
-After the Lightly Worker run we can go to the embedding view of the Lightly Platform
-to see the newly added samples there in a new tag. We see that the new samples
-(in green) fill some gaps left by the images in the first iteration (in grey).
-However, there are still some gaps left, which could be filled by adding more videos
-to the bucket and running the Lightly Worker again.
-
-This workflow of iteratively growing your dataset with the Lightly Worker
-has the following advantages:
-
-- You can learn from your findings after each iteration
- to know which raw data you need to collect next.
-- Only your new data is processed, saving you time and compute cost.
-- You don't need to configure anything, just run the same command again.
-- Only samples which are different to the existing ones are added to the dataset.
-
-If you want to search all data in your bucket for new samples
-instead of only newly added data,
-then set :code:`'datasource.process_all': True` in your worker config. This has the
-same effect as creating a new dataset and running the Lightly Worker from scratch
-on the full dataset. We process all data instead of only the newly added ones.
-
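-This could look as follows when scheduling a run. A minimal sketch, assuming the
-dotted key corresponds to a nested dictionary in the worker config:
-
-.. code-block:: python
-
-    import lightly
-
-    client = lightly.api.ApiWorkflowClient(token="YOUR_TOKEN")
-    client.set_dataset_id_by_name("your-dataset-name")
-
-    # Re-process all data in the bucket instead of only newly added data.
-    client.schedule_compute_worker_run(
-        worker_config={
-            "datasource": {"process_all": True},  # assumed nested form of 'datasource.process_all'
-        },
-    )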
-
-Example
----------------
-
-In this example we will do the following steps:
-
-#. Schedule a run to process a cloud bucket with 3 videos
-#. Add 2 more videos to the same bucket
-#. Run the Lightly Worker with the same config again to use the datapool feature
-
-
-Here we show the content of the bucket before running the Lightly Worker for the
-first time.
-
-.. code-block:: console
-
- videos/
- |-- campus4-c0.avi
- |-- passageway1-c1.avi
- `-- terrace1-c0.avi
-
-Let's create a dataset which uses that bucket (choose your tab - S3, GCS or Azure):
-
-.. tabs::
- .. tab:: AWS S3 Datasource
-
- .. literalinclude:: ./code_examples/python_create_dataset_s3_example.py
- :linenos:
- :language: python
-
- .. tab:: GCS Datasource
-
- .. literalinclude:: ./code_examples/python_create_dataset_gcs_example.py
- :linenos:
- :language: python
-
- .. tab:: Azure Datasource
-
- .. literalinclude:: ./code_examples/python_create_dataset_azure_example.py
- :linenos:
- :language: python
-
-Now we can run the following code to select a subset based on the
-:code:`'stopping_condition_minimum_distance': 0.1` stopping condition. In a first
-selection run, we only select images with the specified minimum distance between
-each other, based on their embeddings.
-
-.. literalinclude:: ./code_examples/python_run_datapool_example.py
- :linenos:
- :language: python
- :emphasize-lines: 8, 30
-
-After running the code we have to make sure we have a running Lightly Worker
-to process the job.
-We can start the Lightly Worker using the following command
-
-.. code-block:: console
-
- docker run --shm-size="1024m" --rm --gpus all -it \
- lightly/worker:latest \
- token=YOUR_TOKEN worker.worker_id=YOUR_WORKER_ID
-
-After we have processed the initial data and created a dataset,
-we've collected more data and our bucket now looks like this:
-
-.. code-block:: console
-
- videos/
- |-- campus4-c0.avi
- |-- campus7-c0.avi
- |-- passageway1-c1.avi
- |-- terrace1-c0.avi
- `-- terrace1-c3.avi
-
-We can run the same script again (it won't create a new dataset but use the
-existing one based on the dataset name). Let's increase the
-:code:`stopping_condition_minimum_distance` to 0.2:
-
-.. literalinclude:: ./code_examples/python_run_datapool_example_2.py
- :linenos:
- :language: python
- :emphasize-lines: 30
-
-
-How It Works
----------------
-
-The Lightly Datapool keeps track of the selected samples in a CSV file called
-`datapool_latest.csv`. It contains the filenames of the selected images and their
-embeddings. This feature is currently only supported without training a custom
-model. Please make sure :code:`'enable_training': False` is set in your worker config.
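-
-To inspect what the datapool currently contains, the CSV file can simply be loaded,
-for example with pandas. The path below is an assumption; the file is written to the
-run's output directory:
-
-.. code-block:: python
-
-    import pandas as pd
-
-    # Path is an assumption; adjust it to your run's output directory.
-    datapool = pd.read_csv("output_dir/datapool_latest.csv")
-    print(datapool.columns)  # inspect the available columns
-    print(datapool.head())   # filenames and embeddings of the selected samples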
diff --git a/docs/source/docker/advanced/datasource_metadata.rst b/docs/source/docker/advanced/datasource_metadata.rst
deleted file mode 100644
index da7288393..000000000
--- a/docs/source/docker/advanced/datasource_metadata.rst
+++ /dev/null
@@ -1,240 +0,0 @@
-.. _docker-datasource-metadata:
-
-Add Metadata to a Datasource
-===============================
-
-The Lightly Worker can make use of metadata collected alongside your images or videos. Provided
-metadata can be used to steer the selection process and to analyze the selected dataset
-in the Lightly Platform.
-
-
-Metadata Folder Structure
-----------------------------
-
-In the following, we outline the format in which metadata can be added to a
-Lightly datasource. Everything regarding metadata takes place in a subdirectory
-of your configured **lightly datasource** called :code:`.lightly/metadata`. The general structure
-of your input and lightly bucket will look like this:
-
-.. code-block:: bash
-
- input/bucket/
- + image_0.png
- + subdir/
- + image_1.png
- + image_2.png
- + ...
- + image_N.png
-
- lightly/bucket/
- + .lightly/metadata
- + schema.json
- + image_0.json
- + subdir/
- + image_1.json
- ...
- + image_N.json
-
-
-All of the `.json` files are explained in the next sections.
-
-
-
-Metadata Schema
----------------
-The schema defines the format of the metadata and helps the Lightly Platform to correctly identify
-and display different types of metadata.
-
-You can provide this information to Lightly Worker by adding a `schema.json` to the
-`.lightly/metadata` directory. The `schema.json` file must contain a list of
-configuration entries. Each of the entries is a dictionary with the following keys:
-
- - `name`: Identifier of the metadata in the UI.
- - `path`: Concatenation of the keys to access the metadata in a dictionary.
- - `defaultValue`: The fallback value if there is no metadata available.
- - `valueDataType`: One of
-
- - `NUMERIC_INT`
- - `NUMERIC_FLOAT`
- - `CATEGORICAL_INT`
- - `CATEGORICAL_STRING`
- - `CATEGORICAL_BOOLEAN`
-
-
-For example, let's say we have additional information about the scene and weather for each
-of the images we have collected. A possible schema could look like this:
-
-.. code-block:: javascript
- :caption: .lightly/metadata/schema.json
-
- [
- {
- "name": "Scene",
- "path": "scene",
- "defaultValue": "undefined",
- "valueDataType": "CATEGORICAL_STRING"
-    },
- {
- "name": "Weather description",
- "path": "weather.description",
- "defaultValue": "nothing",
- "valueDataType": "CATEGORICAL_STRING"
- },
- {
- "name": "Temperature",
- "path": "weather.temperature",
- "defaultValue": 0.0,
- "valueDataType": "NUMERIC_FLOAT"
- },
- {
- "name": "Air pressure",
- "path": "weather.air_pressure",
- "defaultValue": 0,
- "valueDataType": "NUMERIC_INT"
- },
- {
- "name": "Vehicle ID",
- "path": "vehicle_id",
- "defaultValue": 0,
- "valueDataType": "CATEGORICAL_INT"
- }
- ]
-
-
-
-
-Metadata Files
---------------
-Lightly Worker requires a single metadata file per image or video. If an image or video has no corresponding metadata file,
-Lightly Worker assumes the default value from the `schema.json`. If a metadata file is provided for a full video,
-Lightly Worker assumes that the metadata is valid for all frames in that video.
-
-To provide metadata for an image or a video, place a metadata file with the same name
-as the image or video in the `.lightly/metadata` directory but change the file extension to
-`.json`. The file should contain the metadata in the format defined under :ref:`metadata-format`.
-
-
-.. code-block:: bash
-
- # filename of the metadata for file input/bucket/FILENAME.EXT
- .lightly/metadata/${FILENAME}.json
-
- # example: input/bucket/subdir/image_1.png
- .lightly/metadata/subdir/image_1.json
-
- # example: input/bucket/image_0.png
- .lightly/metadata/image_0.json
-
- # example: input/bucket/subdir/video_1.mp4
- .lightly/metadata/subdir/video_1.json
-
-
-When working with videos it's also possible to provide metadata on a per-frame basis.
-Then, Lightly Worker requires a metadata file per frame. If a frame has no corresponding metadata file,
-Lightly Worker assumes the default value from the `schema.json`. Lightly Worker uses a naming convention to
-identify frames: The filename of a frame consists of the video filename, the frame number
-(padded to the length of the number of frames in the video), and the video format, separated
-by hyphens. For example, for a video with 200 frames, the frame number will be padded
-to length three. For a video with 1000 frames, the frame number will be padded to length four (99 becomes 0099).
-
-
-.. code-block:: bash
-
- # filename of the metadata of the Xth frame of video input/bucket/FILENAME.EXT
- # with 200 frames (padding: len(str(200)) = 3)
- .lightly/metadata/${FILENAME}-${X:03d}-${EXT}.json
-
- # example: input/bucket/subdir/video_1.mp4, frame 99/200
- .lightly/metadata/subdir/video_1-099-mp4.json
-
- # example: input/bucket/video_0.mp4, frame 99/200
- .lightly/metadata/video_0-099-mp4.json
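-
-The frame filenames can also be generated programmatically. A small sketch in Python,
-given the video path, the frame index, and the total number of frames:
-
-.. code-block:: python
-
-    from pathlib import Path
-
-    def frame_metadata_path(video_path: str, frame_index: int, num_frames: int) -> Path:
-        """Builds the metadata filename for one frame of a video."""
-        video = Path(video_path)
-        padding = len(str(num_frames))  # e.g. 200 frames -> padding of length 3
-        frame_name = f"{video.with_suffix('')}-{frame_index:0{padding}d}-{video.suffix[1:]}.json"
-        return Path(".lightly/metadata") / frame_name
-
-    # example: frame 99 of input/bucket/subdir/video_1.mp4 with 200 frames
-    print(frame_metadata_path("subdir/video_1.mp4", 99, 200))
-    # .lightly/metadata/subdir/video_1-099-mp4.json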
-
-
-.. _metadata-format:
-
-Metadata Format
----------------
-
-The metadata json files for images and videos require the keys `file_name`, `type`, and `metadata`.
-Here, `file_name` serves as a unique identifier to retrieve the original file for which the metadata was collected,
-`type` indicates whether the metadata is per "video", "frame", or "image", and `metadata` contains the actual metadata.
-
-For our example from above, a metadata file corresponding to an image/video/frame should look like this:
-
-
-.. tabs::
-
-
- .. tab:: Video
-
- .. code-block:: javascript
- :caption: .lightly/metadata/subdir/video_1.json
-
- {
- "file_name": "subdir/video_1.mp4",
- "type": "video",
- "metadata": {
- "scene": "city street",
- "weather": {
- "description": "sunny",
- "temperature": 23.2,
- "air_pressure": 1
- },
- "vehicle_id": 321,
- }
- }
-
- .. tab:: Frame
-
- .. code-block:: javascript
- :caption: .lightly/metadata/subdir/video_1-099-mp4.json
-
- {
- "file_name": "subdir/video_1-099-mp4.png",
- "type": "frame",
- "metadata": {
- "scene": "city street",
- "weather": {
- "description": "sunny",
- "temperature": 23.2,
- "air_pressure": 1
- },
- "vehicle_id": 321,
- }
- }
-
- .. tab:: Image
-
- .. code-block:: javascript
- :caption: .lightly/metadata/subdir/image_1.json
-
- {
- "file_name": "subdir/image_1.png",
- "type": "image",
- "metadata": {
- "scene": "highway",
- "weather": {
- "description": "rainy",
- "temperature": 10.5,
- "air_pressure": 1
- },
- "vehicle_id": 321,
- }
- }
-
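-The metadata files themselves are plain JSON and can be written with a few lines of
-Python before uploading them to your lightly bucket. A minimal sketch for the image
-example above:
-
-.. code-block:: python
-
-    import json
-    from pathlib import Path
-
-    metadata = {
-        "file_name": "subdir/image_1.png",
-        "type": "image",
-        "metadata": {
-            "scene": "highway",
-            "weather": {"description": "rainy", "temperature": 10.5, "air_pressure": 1},
-            "vehicle_id": 321,
-        },
-    }
-
-    out_path = Path(".lightly/metadata/subdir/image_1.json")
-    out_path.parent.mkdir(parents=True, exist_ok=True)
-    with open(out_path, "w") as f:
-        json.dump(metadata, f)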
-
-
-
-Next Steps
-----------
-
-If metadata is provided, the Lightly Worker will automatically detect and load it into
-the Lightly Platform where it can be visualized and analyzed after running a selection.
-
-For example, it's possible to visualize the different categories of metadata in the Lightly
-Platform scatter plot. In the following example we visualized the categorical metadata "Scene"
-from the BDD100k dataset.
-
-
-.. figure:: images/bdd100k_demo_metadata.jpg
diff --git a/docs/source/docker/advanced/datasource_predictions.rst b/docs/source/docker/advanced/datasource_predictions.rst
deleted file mode 100644
index 584bc4d93..000000000
--- a/docs/source/docker/advanced/datasource_predictions.rst
+++ /dev/null
@@ -1,559 +0,0 @@
-.. _docker-datasource-predictions:
-
-Add Predictions to a Datasource
-===============================
-
-The Lightly Worker can not only use the images you provide in a datasource, but also the predictions of an ML model on those images.
-They are used in active learning to select images based on the objects in them.
-Furthermore, object detection predictions can be used to run the Lightly Worker on object level.
-By providing the predictions in the datasource,
-you have full control over them and they scale well to millions of samples.
-If you add new samples to your datasource, you can simultaneously
-add their predictions to the datasource.
-If you already have labels instead of predictions, you can treat them
-just as predictions and upload them the same way.
-
-.. note:: Note that working with predictions requires a minimum
- Lightly Worker of version 2.2. You can check your installed version of the
- Lightly Worker by running the :ref:`docker-setup-sanity-check`.
-
-Predictions Folder Structure
-----------------------------
-
-In the following, we will outline the format of the predictions required by the
-Lightly Worker. Everything regarding predictions will take place in a subdirectory
-of your configured **lightly datasource** called :code:`.lightly/predictions`. The general structure
-of your input and lightly bucket will look like this:
-
-
-.. code-block:: bash
-
- input/bucket/
- + image_0.png
- + subdir/
- + image_1.png
- + image_2.png
- + ...
- + image_N.png
-
- lightly/bucket/
- + .lightly/predictions/
- + tasks.json
- + task_1/
- + schema.json
- + image_0.json
- + subdir/
- + image_1.json
- ...
- + image_N.json
- + task_2/
- + schema.json
- + image_0.json
- + subdir/
- + image_1.json
- ...
- + image_N.json
-
-
-
-Where each subdirectory corresponds to one prediction task (e.g. a classification task
-and an object detection task). All of the files are explained in the next sections.
-
-
-Prediction Tasks
-----------------
-The Lightly Worker needs to know the names of the prediction tasks you want to work with.
-To let it know which tasks exist, simply add a `tasks.json` file to your lightly bucket
-in the subdirectory `.lightly/predictions/`.
-
-The `tasks.json` file must include a list of your task names, which must match the names
-of the subdirectories where your prediction schemas are located.
-
-.. note::
-
-    Only the task names listed within `tasks.json` will be considered.
-    Please ensure that each task name corresponds with the location of your prediction schema.
-    This allows you to specify which subfolders are considered by the Lightly Worker.
-
-For example, let's say we are working with the following folder structure:
-
-.. code-block:: bash
-
- .lightly/predictions/
- + tasks.json
- + classification_weather/
- + schema.json
- ...
- + classification_scenery/
- + schema.json
- ...
- + object_detection_people/
- + schema.json
- ...
- + semantic_segmentation_cars/
- + schema.json
- ...
- + some_directory_containing_irrelevant_things/
-
-
-we can specify which subfolders contain relevant predictions in the `tasks.json`:
-
-.. code-block:: javascript
- :caption: .lightly/predictions/tasks.json
-
- [
- "classification_weather",
- "classification_scenery",
- "object_detection_people",
- "semantic_segmentation_cars",
- ]
-
-.. note::
-
- If you list a subfolder which doesn't contain a valid `schema.json` file,
- the Lightly Worker will report an error! See below how to create a good `schema.json` file.
-
-
-Prediction Schema
------------------
-A prediction schema is required for each task. The schema defines the
-format of the predictions and helps the Lightly Platform to correctly identify
-and display classes. It also helps to prevent errors, as all loaded predictions
-are validated against this schema.
-
-Every schema must include the type of the predictions for this task.
-For classification and object detection the prediction schema must also include
-all the categories and their corresponding ids. For other tasks, such as keypoint
-detection, it can be useful to store additional information like which keypoints
-are connected with each other by an edge.
-
-You can provide all this information to the Lightly Worker by adding a `schema.json` to the
-directory of the respective task. The schema.json file must have a key `categories`
-with a corresponding list of categories following the COCO annotation format.
-It must also have a key `task_type` indicating the type of the predictions.
-The `task_type` must be one of:
-
- - classification
- - object-detection
- - semantic-segmentation
-
-
-For example, let's say we are working with a classification model predicting the weather on an image.
-The three classes are sunny, clouded, and rainy.
-
-
-.. code-block:: javascript
- :caption: .lightly/predictions/classification_weather/schema.json
-
- {
- "task_type": "classification",
- "categories": [
- {
- "id": 0,
- "name": "sunny"
- },
- {
- "id": 1,
- "name": "clouded"
- },
- {
- "id": 2,
- "name": "rainy"
- }
- ]
- }
-
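-Both the `tasks.json` and the `schema.json` are small JSON files that can be generated
-with a short script. A sketch for the weather classification example above:
-
-.. code-block:: python
-
-    import json
-
-    task_name = "classification_weather"
-    categories = ["sunny", "clouded", "rainy"]
-
-    # .lightly/predictions/tasks.json
-    with open("tasks.json", "w") as f:
-        json.dump([task_name], f)
-
-    # .lightly/predictions/classification_weather/schema.json
-    schema = {
-        "task_type": "classification",
-        "categories": [{"id": i, "name": name} for i, name in enumerate(categories)],
-    }
-    with open("schema.json", "w") as f:
-        json.dump(schema, f)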
-
-
-Prediction Files
-----------------
-The Lightly Worker requires a **single prediction file per image**. The file should be a .json
-following the format defined under :ref:`prediction-format` and stored in the subdirectory
-`.lightly/predictions/${TASK_NAME}` in the storage bucket the dataset was configured with.
-In order to make sure the Lightly Worker can match the predictions to the correct source image,
-it's necessary to follow the naming convention:
-
-.. code-block:: bash
-
- # filename of the prediction for image input/bucket/FILENAME.EXT
- .lightly/predictions/${TASK_NAME}/${FILENAME}.json
-
- # example: input/bucket/subdir/image_1.png, classification
- .lightly/predictions/my_classification_task/subdir/image_1.json
-
- # example: input/bucket/image_0.png, classification
- .lightly/predictions/my_classification_task/image_0.json
-
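-A minimal sketch of how a prediction file for a single image could be written locally
-before uploading it to your lightly bucket (the task name matches the examples above;
-the prediction content follows the format described under :ref:`prediction-format`):
-
-.. code-block:: python
-
-    import json
-    from pathlib import Path
-
-    task_name = "my_classification_task"
-    image_name = "subdir/image_1.png"
-
-    # example classification prediction; replace with your model output
-    prediction = {
-        "file_name": image_name,
-        "predictions": [{"category_id": 0, "probabilities": [0.8, 0.1, 0.1]}],
-    }
-
-    out_path = Path(f".lightly/predictions/{task_name}") / Path(image_name).with_suffix(".json")
-    out_path.parent.mkdir(parents=True, exist_ok=True)
-    with open(out_path, "w") as f:
-        json.dump(prediction, f)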
-
-.. _prediction-files-for-videos:
-
-Prediction Files for Videos
----------------------------
-When working with videos, the Lightly Worker requires a prediction file per frame. Lightly
-uses a naming convention to identify frames: The filename of a frame consists of
-the video filename, the frame number (padded to the length of the number of frames
-in the video), and the video format, separated by hyphens. For example, for a
-video with 200 frames, the frame number will be padded to length three. For a video
-with 1000 frames, the frame number will be padded to length four (99 becomes 0099).
-
-.. code-block:: bash
-
- # filename of the predictions of the Xth frame of video input/bucket/FILENAME.EXT
- # with 200 frames (padding: len(str(200)) = 3)
- .lightly/predictions/${TASK_NAME}/${FILENAME}-${X:03d}-${EXT}.json
-
- # example: input/bucket/subdir/video_1.mp4, frame 99/200
- .lightly/predictions/my_classification_task/subdir/video_1-099-mp4.json
-
- # example: input/bucket/video_0.mp4, frame 99/200
- .lightly/predictions/my_classification_task/video_0-099-mp4.json
-
-See :ref:`creating-prediction-files-for-videos` on how to extract video frames
-and create predictions using `ffmpeg <https://ffmpeg.org/>`_ or Python.
-
-.. _prediction-format:
-
-Prediction Format
------------------
-Predictions for an image must have a `file_name` and `predictions`.
-Here, `file_name` serves as a unique identifier to retrieve the image for which
-the predictions are made and `predictions` is a list of `Prediction Singletons` for the corresponding task.
-
-- :code:`probabilities` are the per class probabilities of the prediction
-- :code:`score` is the final prediction score/confidence
-
-.. note:: Some frameworks only provide the score as the model output.
- The score is typically calculated during the Non-Max Suppression (NMS)
- by multiplying the objectness probability with the highest class probability.
-
- But having not only a single score, but also the class probabilities
- can be valuable information for
- active learning. For example, an object detection model could have a
- score of `0.6` and the predicted class is a tree. However, without
- class probabilities, we cannot know what the prediction margin or
- entropy is. With the class
- probabilities we would additionally know whether the model thought
- that it's `0.5` tree, `0.4` person and `0.1` car or `0.5` tree,
- `0.25` person and `0.25` car.
-
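-As a small illustration, common active learning scores can be derived directly from
-the class probabilities. The exact formulas used by Lightly's scorers may differ; this
-is only a sketch:
-
-.. code-block:: python
-
-    import numpy as np
-
-    probabilities = np.array([0.5, 0.4, 0.1])  # e.g. tree, person, car
-
-    # uncertainty margin: small gap between the two most confident classes -> uncertain
-    top_two = np.sort(probabilities)[-2:]
-    uncertainty_margin = 1.0 - (top_two[1] - top_two[0])
-
-    # entropy: high when the probability mass is spread over many classes
-    entropy = -np.sum(probabilities * np.log2(probabilities))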
-
-
-Example classification:
-
-.. code-block:: javascript
- :caption: .lightly/predictions/classification_weather/subdir/image_1.json
-
- {
- "file_name": "subdir/image_1.png",
- "predictions": [ // classes: [sunny, clouded, rainy]
- {
- "category_id": 0,
- "probabilities": [0.8, 0.1, 0.1]
- }
- ]
- }
-
-Example object detection:
-
-.. code-block:: javascript
- :caption: .lightly/predictions/object_detection/subdir/image_1.json
-
- {
- "file_name": "subdir/image_1.png",
- "predictions": [ // classes: [person, car]
- {
- "category_id": 0,
- "bbox": [140, 100, 80, 90], // x, y, w, h coordinates in pixels
- "score": 0.8,
- "probabilities": [0.2, 0.8] // optional, sum up to 1.0
- },
- {
- "category_id": 1,
- "bbox": [...],
- "score": 0.9,
- "probabilities": [0.9, 0.1] // optional, sum up to 1.0
- },
- {
- "category_id": 0,
- "bbox": [...],
- "score": 0.5,
- "probabilities": [0.6, 0.4] // optional, sum up to 1.0
- }
- ]
- }
-
-Example semantic segmentation:
-
-.. code-block:: javascript
- :caption: .lightly/predictions/semantic_segmentation_cars/subdir/image_1.json
-
- {
- "file_name": "subdir/image_1.png",
- "predictions": [ // classes: [background, car, tree]
- {
- "category_id": 0,
- "segmentation": [100, 80, 90, 85, ...], //run length encoded binary segmentation mask
- "score": 0.8,
- "probabilities": [0.15, 0.8, 0.05] // optional, sum up to 1.0
- },
- {
- "category_id": 1,
- "segmentation": [...],
- "score": 0.9,
- "probabilities": [0.02, 0.08, 0.9] // optional, sum up to 1.0
- },
- ]
- }
-
-Note: The filename should always be the relative path from the root directory.
-
-
-Prediction Singletons
----------------------
-The prediction singletons closely follow the `COCO results <https://cocodataset.org/#format-results>`_ format while dropping
-the `image_id`. Note that the `category_id` must be the same as the one defined
-in the schema and that the probabilities (if provided) must follow the order of the category ids.
-
-**Classification:**
-
-For classification, please use the following format:
-
-.. code-block:: javascript
-
- [{
- "category_id" : int,
- "probabilities" : [p0, p1, ..., pN] // optional, sum up to 1.0
- }]
-
-**Object Detection:**
-
-For detection with bounding boxes, please use the following format:
-
-.. code-block:: javascript
-
- [{
- "category_id" : int,
- "bbox" : [x, y, width, height], // coordinates in pixels
- "score" : float,
- "probabilities" : [p0, p1, ..., pN] // optional, sum up to 1.0
- }]
-
-The bounding box format follows the `COCO results `_ documentation.
-
-.. note::
-
- Bounding Box coordinates are pixels measured from the top left image corner.
-
-**Semantic Segmentation:**
-
-For semantic segmentation, please use the following format:
-
-.. code-block:: javascript
-
- [{
- "category_id" : int,
- "segmentation" : [int, int, ...], // run length encoded binary segmentation mask
- "score" : float,
- "probabilities" : [p0, p1, ..., pN] // optional, sum up to 1.0
- }]
-
-Each segmentation prediction contains the binary mask for one category and a
-corresponding score. The score determines the likelihood of the segmentation
-belonging to that category. Optionally, a list of probabilities can be provided
-containing a probability for each category, indicating the likelihood that the
-segment belongs to that category.
-
-To kickstart using the Lightly Worker with semantic segmentation predictions, we created an
-example script that takes model predictions and converts them to the correct
-format: :download:`semantic_segmentation_inference.py <code_examples/semantic_segmentation_inference.py>`
-
-Segmentations are defined with binary masks where each pixel is either set to 0
-or 1 if it belongs to the background or the object, respectively.
-The segmentation masks are compressed using run length encoding to reduce file size.
-Binary segmentation masks can be converted to the required format using the
-following function:
-
-.. code-block:: python
-
- import numpy as np
-
- def encode(binary_mask):
- """Encodes a (H, W) binary segmentation mask with run length encoding.
-
- The run length encoding is an array with counts of subsequent 0s and 1s
- in the binary mask. The first value in the array is always the count of
- initial 0s.
-
- Examples:
-
- >>> binary_mask = [
- >>> [0, 0, 1, 1],
- >>> [0, 1, 1, 1],
- >>> [0, 0, 0, 1],
- >>> ]
- >>> encode(binary_mask)
- [2, 2, 1, 3, 3, 1]
- """
- flat = np.concatenate(([-1], np.ravel(binary_mask), [-1]))
- borders = np.nonzero(np.diff(flat))[0]
- rle = np.diff(borders)
- if flat[1]:
- rle = np.concatenate(([0], rle))
- return rle.tolist()
-
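-If you want to double check your own masks, the inverse operation can be
-sketched as follows. This helper is not required by the Lightly Worker; it is
-only a small sketch that reverses the :code:`encode` function above for sanity
-checks (it reuses the :code:`numpy` import from the previous block):
-
-.. code-block:: python
-
-    def decode(rle, shape):
-        """Decodes a run length encoded mask back into a (H, W) binary mask."""
-        mask = np.zeros(int(np.sum(rle)), dtype=np.uint8)
-        index, value = 0, 0
-        for count in rle:
-            mask[index:index + count] = value
-            index += count
-            value = 1 - value
-        return mask.reshape(shape)
-
-    binary_mask = [
-        [0, 0, 1, 1],
-        [0, 1, 1, 1],
-        [0, 0, 0, 1],
-    ]
-    assert decode(encode(binary_mask), (3, 4)).tolist() == binary_mask
-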
-Segmentation models oftentimes output a probability for each pixel and category.
-Storing such probabilities can quickly result in large file sizes if the input
-images have a high resolution. To reduce storage requirements, Lightly Worker expects
-only a single score or probability per segmentation. If you have scores or
-probabilities for each pixel in the image, you have to first aggregate them
-into a single score/probability. We recommend to take either the median or mean
-score/probability over all pixels within the segmentation mask. The example
-below shows how pixelwise segmentation predictions can be converted to the
-format required by the Lightly Worker.
-
-.. code-block:: python
-
- # Make prediction for a single image. The output is assumed to be a tensor
- # with shape (categories, height, width).
- segmentation = model(image)
-
- # Most probable object category per pixel.
- category = segmentation.argmax(dim=0)
-
- # Convert to lightly predictions.
- predictions = []
- for category_id in category.unique():
- binary_mask = category == category_id
- median_score = segmentation[category_id, binary_mask].median()
- predictions.append({
- 'category_id': int(category_id),
- 'segmentation': encode(binary_mask),
- 'score': float(median_score),
- })
-
- prediction = {
- 'file_name': 'subdir/image_name.png',
- 'predictions': predictions,
- }
-
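-The resulting prediction dictionary can then be written to the corresponding
-file in the predictions folder. Below is only a minimal sketch; the task name
-:code:`semantic_segmentation_cars` and the local output location are assumptions
-and must match your own setup before the files are uploaded to the datasource:
-
-.. code-block:: python
-
-    import json
-    from pathlib import Path
-
-    task_name = "semantic_segmentation_cars"  # assumption: your task name
-    prediction_file = Path(".lightly/predictions") / task_name / "subdir/image_name.json"
-    prediction_file.parent.mkdir(parents=True, exist_ok=True)
-    with open(prediction_file, "w") as file:
-        json.dump(prediction, file)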
-
-.. note::
-
- Support for keypoint detection is coming soon!
-
-
-
-Creating the predictions folder from COCO
------------------------------------------
-
-For creating the predictions folder, we recommend writing a script that takes your predictions and
-saves them in the format just outlined. You can either save the predictions first on your local machine
-and then upload them to your datasource or save them directly to your datasource.
-
-As an example, the following script takes an object detection `COCO predictions file `_.
-It needs as input the path to the predictions file and the directory
-in which the `.lightly` folder should be created.
-Don't forget to change these two parameters at the top of the script.
-
-.. code-block:: python
-
- ### CHANGE THESE PARAMETERS
- output_filepath = "/path/to/create/.lightly/dir"
- annotation_filepath = "/path/to/_annotations.coco.json"
-
- ### Optionally change these parameters
- task_name = "my_object_detection_task"
- task_type = "object-detection"
-
- import json
- import os
- from pathlib import Path
-
- # create prediction directory
- path_predictions = os.path.join(output_filepath, '.lightly/predictions')
- Path(path_predictions).mkdir(exist_ok=True, parents=True)
-
- # Create tasks.json
- path_task_json = os.path.join(path_predictions, 'tasks.json')
- tasks = [task_name]
- with open(path_task_json, 'w') as f:
- json.dump(tasks, f)
-
- # read coco annotations
- with open(annotation_filepath, 'r') as f:
- coco_dict = json.load(f)
-
- # Create schema.json for task
- path_predictions_task = os.path.join(path_predictions, tasks[0])
- Path(path_predictions_task).mkdir(exist_ok=True)
- schema = {
- "task_type": task_type,
- "categories": coco_dict['categories']
- }
- path_schema_json = os.path.join(path_predictions_task, 'schema.json')
- with open(path_schema_json, 'w') as f:
- json.dump(schema, f)
-
- # Create predictions themselves
- image_id_to_prediction = dict()
- for image in coco_dict['images']:
- prediction = {
- 'file_name': image['file_name'],
- 'predictions': [],
- }
- image_id_to_prediction[image['id']] = prediction
- for ann in coco_dict['annotations']:
- pred = {
- 'category_id': ann['category_id'],
- 'bbox': ann['bbox'],
- 'score': ann.get('score', 0)
- }
- image_id_to_prediction[ann['image_id']]['predictions'].append(pred)
-
- for prediction in image_id_to_prediction.values():
- filename_prediction = os.path.splitext(prediction['file_name'])[0] + '.json'
- path_to_prediction = os.path.join(path_predictions_task, filename_prediction)
- with open(path_to_prediction, 'w') as f:
- json.dump(prediction, f)
-
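-If your datasource is an S3 bucket, the generated `.lightly` folder can then be
-copied into the bucket. The snippet below is only a sketch using `boto3`; the
-bucket name and prefix are assumptions and must match your configured datasource:
-
-.. code-block:: python
-
-    import os
-
-    import boto3
-
-    s3 = boto3.client("s3")
-    bucket = "my-datasource-bucket"  # assumption: your bucket name
-    prefix = "my-dataset/"           # assumption: root of your input datasource
-
-    lightly_dir = os.path.join(output_filepath, ".lightly")
-    for root, _, files in os.walk(lightly_dir):
-        for file in files:
-            local_path = os.path.join(root, file)
-            key = prefix + os.path.relpath(local_path, output_filepath)
-            s3.upload_file(local_path, bucket, key)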
-
-.. _creating-prediction-files-for-videos:
-
-Creating Prediction Files for Videos
--------------------------------------
-
-The Lightly Worker expects one prediction file per frame in a video. Predictions can be
-created following the Python example code below. Make sure that `PyAV `_
-is installed on your system for it to work correctly.
-
-.. literalinclude:: code_examples/python_create_frame_predictions.py
-
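-As a rough sketch, and assuming frame filenames follow the
-:code:`{video}-{frame}-{extension}.png` pattern shown in the ffmpeg example
-below, one prediction file per frame could be generated like this. The task
-name and the placeholder model output are assumptions and must be replaced by
-your own values:
-
-.. code-block:: python
-
-    import json
-    from pathlib import Path
-
-    import av  # PyAV
-
-    video_path = Path("video.mp4")  # assumption: example video
-    predictions_dir = Path(".lightly/predictions/my_detection_task")  # assumed task name
-    predictions_dir.mkdir(parents=True, exist_ok=True)
-
-    container = av.open(str(video_path))
-    stream = container.streams.video[0]
-    zero_padding = len(str(stream.frames))  # stream.frames can be 0 for some containers
-
-    for frame_index, frame in enumerate(container.decode(stream)):
-        image = frame.to_image()  # PIL image, pass this to your prediction model
-        model_predictions = []    # assumption: replace with your model output
-
-        frame_name = f"{video_path.stem}-{frame_index:0{zero_padding}d}-{video_path.suffix[1:]}.png"
-        prediction = {
-            "file_name": frame_name,
-            "predictions": model_predictions,
-        }
-        with open(predictions_dir / f"{Path(frame_name).stem}.json", "w") as file:
-            json.dump(prediction, file)
-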
-.. warning::
-
- It is discouraged to use another library than `PyAV `_
- for loading videos with Python as the order and number of loaded frames
- might differ.
-
-
-Extracting Frames with FFMPEG
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Alternatively, frames can also first be extracted as images with `ffmpeg `_
-and then further processed by any prediction model that supports images.
-The example command below shows how to extract frames and save them with the
-:ref:`filename expected by Lightly `. Make sure that
-`ffmpeg `_ is installed on your system before running the
-command.
-
-.. code:: bash
-
- VIDEO=video.mp4; NUM_FRAMES=$(ffprobe -v error -select_streams v:0 -count_frames -show_entries stream=nb_read_frames -of csv=p=0 ${VIDEO}); ffmpeg -r 1 -i ${VIDEO} -start_number 0 ${VIDEO%.mp4}-%0${#NUM_FRAMES}d-mp4.png
-
- # results in the following file structure
- .
- ├── video.mp4
- ├── video-000-mp4.png
- ├── video-001-mp4.png
- ├── video-002-mp4.png
- ├── video-003-mp4.png
- └── ...
diff --git a/docs/source/docker/advanced/images/bdd100k_demo_metadata.jpg b/docs/source/docker/advanced/images/bdd100k_demo_metadata.jpg
deleted file mode 100644
index 552140d0d..000000000
Binary files a/docs/source/docker/advanced/images/bdd100k_demo_metadata.jpg and /dev/null differ
diff --git a/docs/source/docker/advanced/images/object_level_vehicle_car_cluster.jpg b/docs/source/docker/advanced/images/object_level_vehicle_car_cluster.jpg
deleted file mode 100644
index ced2c818f..000000000
Binary files a/docs/source/docker/advanced/images/object_level_vehicle_car_cluster.jpg and /dev/null differ
diff --git a/docs/source/docker/advanced/images/object_level_vehicle_crops_examples.jpg b/docs/source/docker/advanced/images/object_level_vehicle_crops_examples.jpg
deleted file mode 100644
index 3acbdb83d..000000000
Binary files a/docs/source/docker/advanced/images/object_level_vehicle_crops_examples.jpg and /dev/null differ
diff --git a/docs/source/docker/advanced/images/object_level_vehicle_examples.jpg b/docs/source/docker/advanced/images/object_level_vehicle_examples.jpg
deleted file mode 100644
index ee61b2b9e..000000000
Binary files a/docs/source/docker/advanced/images/object_level_vehicle_examples.jpg and /dev/null differ
diff --git a/docs/source/docker/advanced/images/object_level_vehicle_motorbike_cluster.jpg b/docs/source/docker/advanced/images/object_level_vehicle_motorbike_cluster.jpg
deleted file mode 100644
index a38ba6fb1..000000000
Binary files a/docs/source/docker/advanced/images/object_level_vehicle_motorbike_cluster.jpg and /dev/null differ
diff --git a/docs/source/docker/advanced/images/object_level_vehicle_truck_cluster.jpg b/docs/source/docker/advanced/images/object_level_vehicle_truck_cluster.jpg
deleted file mode 100644
index 06cbc9f6d..000000000
Binary files a/docs/source/docker/advanced/images/object_level_vehicle_truck_cluster.jpg and /dev/null differ
diff --git a/docs/source/docker/advanced/images/sequence_selection_pca.png b/docs/source/docker/advanced/images/sequence_selection_pca.png
deleted file mode 100644
index c2d222ceb..000000000
Binary files a/docs/source/docker/advanced/images/sequence_selection_pca.png and /dev/null differ
diff --git a/docs/source/docker/advanced/images/webapp-embedding-after-2nd-docker.png b/docs/source/docker/advanced/images/webapp-embedding-after-2nd-docker.png
deleted file mode 100644
index 1a9d19e3d..000000000
Binary files a/docs/source/docker/advanced/images/webapp-embedding-after-2nd-docker.png and /dev/null differ
diff --git a/docs/source/docker/advanced/load_model_from_checkpoint.rst b/docs/source/docker/advanced/load_model_from_checkpoint.rst
deleted file mode 100644
index 58c28c5e4..000000000
--- a/docs/source/docker/advanced/load_model_from_checkpoint.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-.. _load-model-from-checkpoint:
-
-Load Model from Checkpoint
-==========================
-
-The Lightly Worker can be used to :ref:`train a self-supervised model on your data. `
-Lightly Worker saves the weights of the model after training to a checkpoint file in
-:code:`output_dir/lightly_epoch_X.ckpt`. This checkpoint can then be further
-used to, for example, train a classifier model on your dataset. The code below
-demonstrates how the checkpoint can be loaded:
-
-.. literalinclude:: code_examples/load_model_from_checkpoint.py
diff --git a/docs/source/docker/advanced/object_level.rst b/docs/source/docker/advanced/object_level.rst
deleted file mode 100644
index 5b96289ec..000000000
--- a/docs/source/docker/advanced/object_level.rst
+++ /dev/null
@@ -1,268 +0,0 @@
-.. _docker-object-level:
-
-Object Level
-============
-The Lightly Worker works not only on full images but also on an object level. This
-workflow is especially useful for datasets containing small objects or multiple
-objects in each image and provides the following benefits over the full image
-workflow:
-
-- Analyze a dataset based on individual objects
-- Find a diverse set of objects in the dataset
-- Find images that contain objects of interest
-- Full control over type of objects to process
-- Ignore uninteresting background regions in images
-- Automatic cropping of objects from the original image
-
-.. note:: The object level features require Lightly Worker version 2.2 or newer.
- You can check your installed version of the Lightly Worker by running the
- :ref:`docker-setup-sanity-check`.
-
-
-Prerequisites
--------------
-In order to use the object level workflow with the Lightly Worker, you will need the
-following things:
-
-- The installed Lightly Worker (see :ref:`docker-setup`)
-- A dataset with a configured datasource (see :ref:`ref-docker-with-datasource-datapool`)
-- Object detection predictions uploaded to the datasource (see next section)
-
-
-.. note::
-
- If you don't have any predictions available, you can use the Lightly Worker pretagging
- model. See :ref:`Pretagging ` for more information.
-
-
-Predictions
------------
-The Lightly Worker needs to know which objects to process. This information is provided
-by uploading a set of object predictions to the datasource (see :ref:`docker-datasource-predictions`).
-Let's say we are working with a dataset containing different types of vehicles
-and used an object detection model to find possible vehicle objects in the
-dataset. Then the file structure of the datasource should look like this:
-
-.. code-block:: bash
-
- datasource/vehicles_dataset/
- + .lightly/predictions/
- + tasks.json
- + vehicles_object_detections/
- + schema.json
- + image_1.json
- ...
- + image_N.json
- + image_1.png
- + image_2.png
- ...
- + image_N.png
-
-
-The following files should be added to the *.lightly/predictions*
-directory in the datasource:
-
-- A *tasks.json* file that contains the name of the subdirectory in which the
- prediction files are stored.
-
- .. code-block::
- :caption: .lightly/predictions/tasks.json
-
- [
- "vehicles_object_detections"
- ]
-
-- A *schema.json* file that specifies that the predictions are from an
- *object-detection* task and a list of all possible object categories.
-
- .. code-block:: javascript
- :caption: .lightly/predictions/vehicles_object_detections/schema.json
-
- {
- "task_type": "object-detection",
- "categories": [
- {
- "id": 0,
- "name": "car",
- },
- {
- "id": 1,
- "name": "truck",
- },
- {
- "id": 2,
- "name": "motorbike",
- }
- ]
- }
-
-- And for each image, or video frame, in the dataset an *IMAGE_NAME.json* file
- which holds the predictions the object detection model made for the given image:
-
- .. code-block:: javascript
- :caption: .lightly/predictions/vehicles_object_detections/image_1.json
-
- {
- "file_name": "image_1.png",
- "predictions": [
- {
- "category_id": 1,
- "bbox": [...],
- "score": 0.8
- },
- {
- "category_id": 0,
- "bbox": [...],
- "score": 0.9
- },
- {
- "category_id": 2,
- "bbox": [...],
- "score": 0.5
- }
- ]
- }
-
-For more information regarding the predictions format please see :ref:`docker-datasource-predictions`.
-
-
-Selection on Object Level
--------------------------
-Once you have everything set up as described above, you can run selection on
-object level by setting the `object_level.task_name` argument in the :ref:`docker configuration `.
-The argument should be set to the task name you used for your predictions.
-If you uploaded the predictions to e.g. `.lightly/predictions/vehicles_object_detections`
-then you should set `object_level.task_name` to `vehicles_object_detections`.
-
-The object level job can be scheduled from Python code.
-
-.. literalinclude:: code_examples/python_run_object_level.py
-
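-A rough sketch of such a scheduling call, assuming the dataset and datasource
-have already been configured and reusing the task name from the example above,
-could look like this (the number of samples is only a placeholder):
-
-.. code-block:: python
-
-    from lightly.api import ApiWorkflowClient
-
-    client = ApiWorkflowClient(token="MY_AWESOME_TOKEN", dataset_id="MY_DATASET_ID")
-
-    client.schedule_compute_worker_run(
-        worker_config={
-            "enable_corruptness_check": True,
-            "remove_exact_duplicates": True,
-            "object_level": {
-                "task_name": "vehicles_object_detections",
-            },
-        },
-        selection_config={
-            "n_samples": 1000,  # placeholder: number of object crops to select
-            "strategies": [
-                {
-                    "input": {"type": "EMBEDDINGS"},
-                    "strategy": {"type": "DIVERSITY"},
-                }
-            ],
-        },
-    )
-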
-After running the Python script to create the job, we need to make sure we have
-a running Lightly Worker to process it. We can use the following
-command to spin up a Lightly Worker:
-
-.. code-block:: console
-
- docker run --shm-size="1024m" --rm --gpus all -it \
- lightly/worker:latest \
- token=YOUR_TOKEN worker.worker_id=YOUR_WORKER_ID
-
-.. _object-level-pretagging:
-
-Lightly Pretagging
-------------------
-Instead of providing your own predictions, it's also possible to use the built-in pretagging model from Lightly. To do so,
-set `pretagging=True` in your config and set `object_level.task_name="lightly_pretagging"`. For more information
-about the prediction model and its classes, see :ref:`Lightly Pretagging Model `.
-
-.. literalinclude:: code_examples/python_run_object_level_pretagging.py
-
-After running the Python script to create the job, we need to make sure we have
-a running Lightly Worker to process it. We can use the following
-command to spin up a Lightly Worker:
-
-.. code-block:: console
-
- docker run --shm-size="1024m" --rm --gpus all -it \
- lightly/worker:latest \
- token=YOUR_TOKEN worker.worker_id=YOUR_WORKER_ID
-
-Padding
--------
-The Lightly Worker makes it possible to add a padding around your bounding boxes. This allows
-for better visualization of the cropped images in the web-app and can improve the
-embeddings of the objects as the embedding model sees the objects in context. To add
-padding, simply specify `object_level.padding=X` where `X` is the padding relative
-to the bounding box size. For example, a padding of `X=0.1` will extend both width and
-height of all bounding boxes by 10 percent.
-
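-For example, a sketch of the relevant part of the worker configuration with a
-10 percent padding, reusing the task name from above, could look like this:
-
-.. code-block:: python
-
-    worker_config = {
-        "object_level": {
-            "task_name": "vehicles_object_detections",
-            "padding": 0.1,  # extend width and height of all boxes by 10 percent
-        },
-    }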
-
-Object Crops Dataset
---------------------
-Once the Lightly Worker job is started it fetches all images and predictions from the
-remote datasource and processes them. For each prediction, the Lightly Worker crops
-the object from the full image and creates an embedding for it. Then it selects
-a subset of the objects and uploads **two** datasets to the Lightly Platform:
-
-1. The crops and embeddings of the selected objects are uploaded to an object
- *crops* dataset on the platform. By default, the dataset has the same name as
- the original image dataset but with a "-crops" suffix appended to it.
- Alternatively, you can also choose a custom dataset name by setting
- the `object_level.crop_dataset_name` config option.
-2. If an object is selected, then the full image containing that object is
- also uploaded. You can find these images in the original dataset from which
- you started the selection job.
-
-You can see example images of the two datasets below.
-
-Object Crop Dataset:
-
-.. figure:: images/object_level_vehicle_crops_examples.jpg
-
-
-Original Full Image Dataset:
-
-.. figure:: images/object_level_vehicle_examples.jpg
-
-
-Analyzing the Crop Dataset
---------------------------
-The crop dataset allows you to analyze your data on an object level. In our
-vehicles dataset we could, for example, be interested in the diversity of the
-vehicles. If we go to our crops dataset and select the *Embedding* view in the
-menu, we can see that crops are roughly grouped by vehicle type:
-
-Cars:
-
-.. figure:: images/object_level_vehicle_car_cluster.jpg
-
-Trucks:
-
-.. figure:: images/object_level_vehicle_truck_cluster.jpg
-
-Motorbikes:
-
-.. figure:: images/object_level_vehicle_motorbike_cluster.jpg
-
-
-This can be a very efficient way to get insights into your data without the need
-for human annotations. The embedding view allows you to dig deeper into the
-properties of your dataset and reveal things like:
-
-- Q: What sort of special trucks do we have?
- A: There are a lot of ambulances and school buses.
-- Q: Are there also vans in the dataset?
- A: There are only a few of them; we should try to get more images containing vans.
-- Q: Are there images of cars in different weather conditions?
- A: Most images seem to be taken in sunny weather with good lighting conditions.
-
-These hidden biases are hard to find in a dataset if you only rely on full
-images or the coarse vehicle type predicted by the object detection model.
-The Lightly Worker helps you to identify them quickly and assists you in monitoring and
-improving the quality of your dataset. After an initial exploration you can now
-take further steps to enhance the dataset using one of the workflows the Lightly Worker
-provides:
-
-- Select a subset of your data using our :ref:`Sampling Algorithms `
-- Select new samples to add to your dataset using :ref:`Active Learning `
-- Prepare images for labelling by :ref:`exporting them to LabelStudio `
-
-
-Multiple Object Level Runs
---------------------------
-You can run multiple object level workflows using the same dataset. To start a
-new run, please select your original full image dataset in the Lightly Platform
-and schedule a new run from there. If you are running the Lightly Worker from Python or
-over the API, you have to set the `dataset_id` configuration option to the id of
-the original full image dataset. In both cases make sure that the run is *not*
-started from the crops dataset as this is not supported!
-
-You can control to which crops dataset the newly selected object crops are
-uploaded by setting the `object_level.crop_dataset_name` configuration option.
-By default this option is not set and if you did not specify it in the first run,
-you can also omit it in future runs. In this case the Lightly Worker will automatically
-find the existing crops dataset and add the new crops to it. If you want to
-upload the crops to a new dataset or have set a custom crop dataset name in a
-previous run, then set the `object_level.crop_dataset_name` option to a new
-or existing dataset name, respectively.
diff --git a/docs/source/docker/advanced/overview.rst b/docs/source/docker/advanced/overview.rst
deleted file mode 100644
index 3057c5448..000000000
--- a/docs/source/docker/advanced/overview.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-Advanced
-===================================
-Here you learn more advanced usage patterns of Lightly Worker.
-
-
-.. toctree::
- :maxdepth: 2
-
- datapool.rst
- pretagging.rst
- datasource_predictions.rst
- datasource_metadata.rst
- sequence_selection.rst
- object_level.rst
- load_model_from_checkpoint.rst
diff --git a/docs/source/docker/advanced/pretagging.rst b/docs/source/docker/advanced/pretagging.rst
deleted file mode 100644
index c5ae6666e..000000000
--- a/docs/source/docker/advanced/pretagging.rst
+++ /dev/null
@@ -1,117 +0,0 @@
-.. _docker-pretagging:
-
-Pretagging
-======================
-
-Lightly Worker supports the use of pre-trained models to tag the dataset. We
-call this pretagging. For now, we offer a pre-trained model for object detection
-optimized for autonomous driving.
-
-Using a pretrained model does not remove the need for high-quality human annotations.
-However, we can use the model predictions to get some idea of the underlying
-distribution within the dataset.
-
-The model is capable of detecting the following core classes:
-
- - bicycle
- - bus
- - car
- - motorcycle
- - person
- - train
- - truck
-
-
-How It Works
----------------
-
-Our pretagging model is based on a FasterRCNN model with a ResNet-50 backbone.
-The model has been trained on a dataset consisting of ~100k images.
-
-The results of pretagging are visualized in the report. We report the
-object distribution both before and after the selection process.
-
-The following image shows an example of such a histogram for the input data
-before filtering.
-
-.. figure:: ../resources/pretagging_histogram_example.png
- :align: center
- :alt: some alt text
-
- Histogram plot of the pretagging model for the input data (full dataset).
- The plot shows the distribution of the various detected classes.
- Furthermore, it shows the average number of objects per image.
-
-For every Lightly Worker run with pretagging enabled we also dump all model predictions
-into a json file with the following format:
-
-.. code-block:: javascript
-
- // boxes have format x1, y1, x2, y2
- [
- {
- "filename": "0000000095.png",
- "boxes": [
- [
- 0.869,
- 0.153,
- 0.885,
- 0.197
- ],
- [
- 0.231,
- 0.175,
- 0.291,
- 0.202
- ]
- ],
- "labels": [
- "person",
- "car"
- ],
- "scores": [
- 0.9845203757286072,
- 0.9323102831840515
- ]
- },
- ...
- ]
-
-
-Usage
----------------
-
-Pretagging can be activated by passing the following argument to your
-Lightly Worker config: :code:`'pretagging': True`
-
-- :code:`'pretagging': True` enables the use of the pretagging model
-- :code:`'pretagging_debug': True` adds a few debug images to the report showing the bounding box predictions on the image.
-
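-As a minimal illustration, the relevant part of the worker configuration when
-scheduling a run from Python could look like this:
-
-.. code-block:: python
-
-    worker_config = {
-        "pretagging": True,        # enable the pretagging model
-        "pretagging_debug": True,  # add debug images to the report
-    }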
-
-A full Python script showing how to create such a job is shown here:
-
-.. literalinclude:: ./code_examples/python_run_pretagging.py
- :linenos:
- :emphasize-lines: 75-76
- :language: python
-
-
-After running the Python script to create the job, we need to make sure we have
-a running Lightly Worker to process it. We can use the following
-command to spin up a Lightly Worker:
-
-.. code-block:: console
-
- docker run --shm-size="1024m" --rm --gpus all -it \
- lightly/worker:latest \
- token=YOUR_TOKEN worker.worker_id=YOUR_WORKER_ID
-
-The following shows an example of what the debugging images in the report look like:
-
-.. figure:: ../resources/pretagging_debug_example.png
- :align: center
- :alt: some alt text
-
- The plot shows the detected bounding boxes from the pretagging overlaid
- on the image. Use the debug feature to figure out whether the pretagging
- mechanism works properly on your dataset.
diff --git a/docs/source/docker/advanced/sequence_selection.rst b/docs/source/docker/advanced/sequence_selection.rst
deleted file mode 100644
index e60089243..000000000
--- a/docs/source/docker/advanced/sequence_selection.rst
+++ /dev/null
@@ -1,218 +0,0 @@
-.. _sequence-selection:
-
-Sequence Selection
-==================
-
-Sequence selection allows users to select sequences of a video instead of single frames.
-The key concept is the parameter `selected_sequence_length`. If its value is one (default),
-the Lightly Worker selects single frames. If it is larger than one, each video is split into
-sequences of that length and the frame representations are aggregated into a sequence
-representation. The selection then happens on these sequence representations.
-
-.. note:: Sequence selection works on videos or on folders of alphabetically sorted
- frames.
-
-
-How It Works
--------------
-Sequence selection consists of the following steps:
-
-1. Each input video is split into sequences of length `selected_sequence_length`.
-2. Next, the embeddings of all frames in a sequence are aggregated (averaged).
-3. The selection is performed on sequence level.
-4. Finally, the indices of the selected sequence frames are reconstructed.
-5. Information about the selected sequences is saved in the output directory.
-6. The report is generated and (if requested) the selected frames are saved.
-
-
-Usage
------------
-
-To select sequences of length **X** simply add the argument **selected_sequence_length=X**
-to your Lightly Worker run command. Here, **X** must be an integer which evenly divides
-the **stopping_condition.n_samples**. If **stopping_condition.n_samples** is a fraction,
-the Lightly Worker will automatically round it to the next multiple of **X**.
-
-
-For example, let's say we have a folder with two videos
-which we randomly downloaded from `Pexels `_:
-
-.. code-block:: console
-
- ls /datasets/pexels/
- > Pexels_Videos_1409899.mp4 Pexels_Videos_2495382.mp4
-
-Now, we want to select sequences of length ten. We can use the following script:
-
-.. literalinclude:: code_examples/python_run_sequence_selection.py
-
-The above script will create a run to select 20 sequences each consisting of ten frames. The selected
-frames are then saved in the output directory for further processing. Note that Lightly Worker
-currently doesn't support the corruptness check and removing exact duplicates for
-sequence selection. Hence we have to deactivate them in the command above.
-
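-A rough sketch of such a scheduling call, assuming the legacy
-stopping_condition-style worker configuration described in the configuration
-section, could look like this:
-
-.. code-block:: python
-
-    from lightly.api import ApiWorkflowClient
-
-    client = ApiWorkflowClient(token="MY_AWESOME_TOKEN", dataset_id="MY_DATASET_ID")
-
-    client.schedule_compute_worker_run(
-        worker_config={
-            # corruptness check and exact duplicate removal are not supported
-            # for sequence selection, so both are disabled here
-            "enable_corruptness_check": False,
-            "remove_exact_duplicates": False,
-            "selected_sequence_length": 10,
-            "stopping_condition": {
-                "n_samples": 200,  # 20 sequences x 10 frames per sequence
-            },
-        },
-    )
-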
-To make sure our run gets processed we need to make sure we have a Lightly Worker
-running:
-
-.. code-block:: console
-
- docker run --shm-size="1024m" --rm --gpus all -it \
- -v /docker-output:/home/output_dir lightly/worker:latest \
- token=YOUR_TOKEN worker.worker_id=YOUR_WORKER_ID
-
-.. warning:: The stopping condition `n_samples` must be equal to the number of
- desired sequences times the `selected_sequence_length`, i.e. **n_samples = n_sequences x selected_sequence_length**.
- In the example above 20 sequences times ten frames is exactly 200.
-
-
-In our example, a look at a PCA of the embeddings of the selected frames nicely shows
-the 20 selected sequences. The following image is taken from the output of the
-Lightly Worker:
-
-.. figure:: images/sequence_selection_pca.png
- :align: center
- :alt: PCA of embeddings of frames
- :figwidth: 80%
-
- PCA of the embeddings of the frames in the selected sequences from the two
- input videos (yellow and purple).
-
-
-Sequence Selection Information
-------------------------------
-
-The Lightly Worker will create a file at `{docker-output}/data/sequence_information.json`
-containing detailed information about the selected sequences. The file can be used
-for further analysis of your dataset based on sequences.
-
-The file contains a list of sequence dictionaries. Every dictionary lists the
-exact contents for one sequence. In the case of video frame sequences the
-`sequence_information.json` will look similar to the example shown below:
-
-.. code:: json
-
- [
- {
- "video_name": "Pexels_Videos_1409899.mp4",
- "frame_names": [
- "Pexels_Videos_1409899-40-mp4.png",
- "Pexels_Videos_1409899-41-mp4.png",
- "Pexels_Videos_1409899-42-mp4.png",
- ...
- ],
- "frame_timestamps_pts": [
- 359726680,
- 368719847,
- 377713014,
- ...
- ],
- "frame_timestamps_sec": [
- 4.886145,
- 5.008298625,
- 5.13045225,
- ...
- ],
- "frame_indices": [
- 40,
- 41,
- 42,
- ...
- ]
- },
- {
- "video_name": "Pexels_Videos_1409899.mp4",
- "frame_names": [
- "Pexels_Videos_1409899-100-mp4.png",
- "Pexels_Videos_1409899-101-mp4.png",
- "Pexels_Videos_1409899-102-mp4.png",
- ...
- ],
- "frame_timestamps_pts": [
- 422678849,
- 431672016,
- 440665183,
- ...
- ],
- "frame_timestamps_sec": [
- 6.095856060606061,
- 6.217773181818182,
- 6.339690303030303,
- ...
- ],
- "frame_indices": [
- 100,
- 101,
- 102,
- ...
- ]
- },
- ...
- ]
-
-
-For image file sequences it only lists the filenames for every sequence:
-
-.. code:: json
-
- [
- {
- "filenames": [
- "image_40.png",
- "image_41.png",
- "image_42.png",
- ...
- ]
- },
- {
- "filenames": [
- "image_100.png",
- "image_101.png",
- "image_102.png",
- ...
- ]
- },
- ...
- ]
-
-
-Cropping Sequences From Videos
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Using the timestamps stored in the `sequence_information.json` file, the selected
-video sequences can be cropped from the original videos. Make sure that
-`ffmpeg `_ is available on your system for cropping the videos.
-
-There are two types of stored timestamps:
-
-* `frame_timestamps_pts`: Presentation timestamps in timebase units of the video.
-* `frame_timestamps_sec`: Presentation timestamps in seconds.
-
-To crop a sequence, the first and last timestamp from the `frame_timestamps_pts`
-list and the `video_name` stored in the `sequence_information.json` file are
-required. The cropping can be done with the following command using an
-`ffmpeg trim filter `_:
-
-.. code:: console
-
- ffmpeg -i {VIDEO_NAME} -copyts -filter "trim=start_pts={FIRST_TIMESTAMP_PTS}:end_pts={LAST_TIMESTAMP_PTS + 1}" {SEQUENCE_NAME}
-
- # example using the videos from above
- ffmpeg -i Pexels_Videos_1409899.mp4 -copyts -filter "trim=start_pts=359726680:end_pts=377713015" sequence_1.mp4
-
-.. warning::
-
- Make sure that `end_pts` is set to `LAST_TIMESTAMP_PTS + 1`, otherwise the last
- frame in the sequence will not be included in the cropped video!
-
-Sequences can also be cropped using the first and last timestamp from the `frame_timestamps_sec`
-list. However, depending on the video and sequence, this can result in the last frame
-of the sequence not being included in the cropped video. We recommend using
-`frame_timestamps_pts` if possible. The following command can be used for cropping using
-`frame_timestamps_sec`:
-
-.. code:: console
-
- ffmpeg -i {VIDEO_NAME} -copyts -filter "trim=start={FIRST_TIMESTAMP_SEC}:end={LAST_TIMESTAMP_SEC}" {SEQUENCE_NAME}
-
- # example using the videos from above
- ffmpeg -i Pexels_Videos_1409899.mp4 -copyts -filter "trim=start=4.886145:end=5.985527625" sequence_1.mp4
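-
-Instead of copying the timestamps by hand, the trim commands can also be
-generated from the `sequence_information.json` file. The following is only a
-sketch that prints one ffmpeg command per sequence using the pts timestamps
-(the path to the file is an assumption and depends on your output directory):
-
-.. code-block:: python
-
-    import json
-
-    with open("docker-output/data/sequence_information.json") as file:
-        sequences = json.load(file)
-
-    for index, sequence in enumerate(sequences):
-        video_name = sequence["video_name"]
-        first_pts = sequence["frame_timestamps_pts"][0]
-        last_pts = sequence["frame_timestamps_pts"][-1]
-        # end_pts must be the last timestamp + 1, otherwise the last frame is dropped
-        print(
-            f"ffmpeg -i {video_name} -copyts "
-            f'-filter "trim=start_pts={first_pts}:end_pts={last_pts + 1}" '
-            f"sequence_{index}.mp4"
-        )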
diff --git a/docs/source/docker/code_examples/webapp_default_lightly_config.txt b/docs/source/docker/code_examples/webapp_default_lightly_config.txt
deleted file mode 100644
index daefb7809..000000000
--- a/docs/source/docker/code_examples/webapp_default_lightly_config.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-{
- loader: {
- batch_size: 16,
- shuffle: true,
- num_workers: -1,
- drop_last: true
- },
- model: {
- name: 'resnet-18',
- out_dim: 128,
- num_ftrs: 32,
- width: 1
- },
- trainer: {
- gpus: 1,
- max_epochs: 100,
- precision: 32
- },
- criterion: {
- temperature: 0.5
- },
- optimizer: {
- lr: 1,
- weight_decay: 0.00001
- },
- collate: {
- input_size: 64,
- cj_prob: 0.8,
- cj_bright: 0.7,
- cj_contrast: 0.7,
- cj_sat: 0.7,
- cj_hue: 0.2,
- min_scale: 0.15,
- random_gray_scale: 0.2,
- gaussian_blur: 0.5,
- kernel_size: 0.1,
- vf_prob: 0,
- hf_prob: 0.5,
- rr_prob: 0
- }
-}
\ No newline at end of file
diff --git a/docs/source/docker/code_examples/webapp_default_worker_config.txt b/docs/source/docker/code_examples/webapp_default_worker_config.txt
deleted file mode 100644
index 28c29ad62..000000000
--- a/docs/source/docker/code_examples/webapp_default_worker_config.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-{
- enable_corruptness_check: true,
- remove_exact_duplicates: true,
- enable_training: false,
- pretagging: false,
- pretagging_debug: false,
- method: 'coreset',
- stopping_condition: {
- n_samples: 0.1,
- min_distance: -1
- },
- scorer: 'object-frequency',
- scorer_config: {
- frequency_penalty: 0.25,
- min_score: 0.9
- },
- active_learning: {
- task_name: '',
- score_name: 'uncertainty_margin'
- }
-}
\ No newline at end of file
diff --git a/docs/source/docker/configuration/configuration.rst b/docs/source/docker/configuration/configuration.rst
deleted file mode 100644
index 241e4766f..000000000
--- a/docs/source/docker/configuration/configuration.rst
+++ /dev/null
@@ -1,223 +0,0 @@
-.. _docker-configuration:
-
-Configuration
-===================================
-
-Similar to the
-`lightly open-source framework `_, the Lightly Worker solution
-can be configured using Hydra as well.
-
-The following example shows how the `token` parameter is set.
-
-.. code-block:: console
-
- docker run --shm-size="1024m" --rm -it \
- -v {OUTPUT_DIR}:/home/shared_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN
-
-Note that `token` can optionally be passed as a `LIGHTLY_TOKEN` environment variable to keep it hidden from logs:
-
-.. code-block:: console
-
- docker run --shm-size="1024m" --rm -it \
- -e LIGHTLY_TOKEN=MYAWESOMETOKEN
- -v {OUTPUT_DIR}:/home/shared_dir \
- lightly/worker:latest
-
-.. _rst-docker-parameters:
-
-List of Parameters
------------------------------------
-
-The following are parameters which can be passed to the container:
-
-.. code-block:: yaml
-
- # Access token, get it from app.lightly.ai.
- token: ''
-
- worker:
- # If specified, the docker is started as a worker on the Lightly Platform.
- worker_id: ''
- # If True, the worker notifies that it is online even though another worker
- # with the same worker_id is already online.
- # This can be useful if that worker is actually offline but was not able to properly shutdown due to it crashing.
- # If False and already a worker exists, the docker aborts.
- force_start: True
-
- # Set to True to check whether installation was successful.
- sanity_check: False
-
- # Path to a file containing filenames to run the Lightly Worker on a subset of the
- # files in the input directory (local folder or datasource path).
- # The Lightly Worker will ignore all files in the input
- # directory not listed here. Each filename must be on a separate line and
- # relative to the input directory.
- # If you use a cloud bucket as datasource, the path is relative
- # to the root of your input datasource. If you specified a 2nd output datasource,
- # and the path contains `.lightly`, then the path is relative to the output datasource.
- # If you have a local input directory, the path is relative to the shared directory,
- # so if the file is in 'shared_dir/directory/relevant_filenames.txt'
- # the path should be set to 'directory/relevant_filenames.txt'
- relevant_filenames_file: ''
-
- # Set to False to disable check for corrupted images.
- enable_corruptness_check: True
- corruptness_check:
- # Threshold in [0, 1] which determines the sensitivity of the corruptness check
- # for video frames. Every frame which has an internally computed corruptness
- # score larger than the specified threshold will be classified as corrupted.
- corruption_threshold: 0.1
-
- # Remove exact duplicates.
- remove_exact_duplicates: True
-
- # Path to the checkpoint relative to the shared directory.
- checkpoint: ''
-
- # Path to the embeddings file relative to the shared directory.
- embeddings: ''
-
- # Enable training, only possible when no embeddings are passed.
- enable_training: False
-
- # Dump the final dataset to the output directory.
- dump_dataset: False
- dump_sampled_embeddings: True
- # Set the size of the dumped images, use =x or =[height,width] to match the
- # shortest edge to x or to resize the image to (height, width), use =-1 for no
- # resizing (default). This only affects the output size of the images dumped to
- # the output folder with dump_dataset=True. To change the size of images
- # uploaded to the lightly Platform or your cloud bucket please use the
- # lightly.resize option instead.
- output_image_size: -1
- output_image_format: 'png'
-
- # Upload the dataset to the Lightly Platform.
- upload_dataset: False
-
- # pretagging
- pretagging: False
- pretagging_debug: False
- pretagging_config: ''
- pretagging_upload: False
-
- # Append weak labels.
- append_weak_labels: False
-
- # Normalize the embeddings to unit length.
- normalize_embeddings: True
-
- # active learning scorer
- scorer: 'object-frequency'
- scorer_config:
- frequency_penalty: 0.25
- min_score: 0.9
-
-
- # Selection
- # Choose from ['coreset', 'random'].
- method: 'coreset'
- # Choose when to stop the selection
- stopping_condition:
- # The maximum number of samples selected
- # Float in [0., 1.] for percentage, int for number of samples, -1 means inactive.
- n_samples: -1
- # Float, minimum distance between two selected images. -1 means inactive.
- min_distance: -1.
- selected_sequence_length: 1
-
- # datapool
- datapool:
- # Name of the datapool. This will create a local datapool.
- name:
- # If True keeps backup of all previous data pool states.
- keep_history: True
- # Dataset id from Lightly Platform where the datapool should be hosted.
- dataset_id:
-
- # datasource
- # By default only new samples in the datasource are processed. Set process_all
- # to True to reprocess all samples in the datasource.
- datasource:
- # Dataset id from the Lightly Platform.
- dataset_id:
- # Set to True to reprocess all samples in the datasource.
- process_all: False
- # Update datapool with the selected samples.
- enable_datapool_update: True
- # Use video metadata to determine the number of frames in each video. Set to
- # True for faster processing. Set to False if you get video related errors.
- use_frame_count_metadata: False
- # This feature flag enables runs which take longer than 7 days by bypassing
- # the limitation of signed read URLs of S3, GCS and Azure.
- # The tradeoff is that it will take longer to fully read and process all the
- # data which is stored in the bucket configured as your datasource resulting
- # in a longer total duration.
- # Only enable this if you are certain that your run will take longer than
- # 7 days to complete.
- # This feature is always enabled when a S3 datasource with delegated access
- # is configured.
- use_redirected_read_url: False
- # Bypass the verification of read/write access to the datasource
- bypass_verify: False
-
-
- # active learning
- active_learning:
- task_name: ''
- score_name: 'uncertainty_margin'
-
- # object level
- object_level:
- # Name of the object prediction task which contains all the object annotations.
- task_name: ''
- # Name of the additional crop dataset on the Lightly Platform. A new dataset
- # is created if no dataset with this name exists.
- crop_dataset_name: ''
- # Padding relative to the bbox size
- padding: 0.0
-
- # Upload report to the Lightly Platform.
- upload_report: True
- # The number of retained/removed image pairs shown in the report.
- n_example_images: 6
- # Maximum size of the distance matrix allowed for report statistics in GB.
- memory_requirement_in_GB: 2
- # Show timestamps of the selected frames for each video in the report. Set this
- # to False if you observe slow report generation or work with many videos (>20).
- show_video_sampling_timeline: True
-
- # optional deterministic unique output subdirectory for run, in place of timestamp
- run_directory:
-
-To get an overview of all possible configuration parameters of the Lightly Worker,
-please check out :ref:`ref-cli-config-default`
-
-Choosing the Right Parameters
------------------------------------
-
-Below you find some distributions and the resulting histogram of the pairwise
-distances. Typically, datasets consist of multiple normal or uniform
-distributions (second row). This makes sense. In autonomous driving, we collect
-data in various cities, different weather conditions, or other factors. When
-working with video data from multiple cameras, each camera might form a cluster
-since images from the same static camera have lots of perceptual similarity.
-
-The more interesting question is what kind of distribution you're aiming for.
-
-
-**If we want to diversify the dataset** (e.g. create a really hard test set
-covering all the special cases) we might want to aim for what looks like a grid.
-The log histogram (yes, we plot the histograms in log scale!) for a grid pattern with
-equal distance between two neighboring samples looks like a D.
-
-
-**If you want to remove nearby duplicates** (e.g. reduce overfitting and bias)
-we see good results when sampling with the *min_distance* stopping condition.
-E.g. set the *min_distance* to 0.1 to get rid of the small peak (if there is any)
-close to 0 pairwise distance.
-
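-As an illustration, a sketch of a selection configuration that removes nearby
-duplicates with a minimum distance of 0.1 could look like the snippet below.
-The keys map to the *method* and *stopping_condition* parameters listed above:
-
-.. code-block:: python
-
-    worker_config = {
-        "method": "coreset",
-        "stopping_condition": {
-            "n_samples": -1,      # inactive, stop based on min_distance only
-            "min_distance": 0.1,  # remove nearby duplicates
-        },
-    }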
-
-.. image:: images/histograms_overview.png
diff --git a/docs/source/docker/configuration/images/histograms_overview.png b/docs/source/docker/configuration/images/histograms_overview.png
deleted file mode 100644
index 4e252a1ff..000000000
Binary files a/docs/source/docker/configuration/images/histograms_overview.png and /dev/null differ
diff --git a/docs/source/docker/examples/academic_datasets.rst b/docs/source/docker/examples/academic_datasets.rst
deleted file mode 100644
index 72b9832af..000000000
--- a/docs/source/docker/examples/academic_datasets.rst
+++ /dev/null
@@ -1,243 +0,0 @@
-ImageNet
-========
-
-Let's have a look at how to run the Lightly Worker to analyze and filter the famous
-ImageNet dataset. We are assuming here that the ImageNet dataset is located in an S3
-bucket under `s3://dataset/imagenet/`. Start by creating a dataset and configuring the datasource.
-
-.. note:: For all examples we assume that the Lightly Worker is configured and running. See :ref:`docker-setup` for more information.
-
-
-.. code-block:: python
-
- from lightly.api import ApiWorkflowClient
- from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
- from lightly.openapi_generated.swagger_client.models.datasource_purpose import DatasourcePurpose
-
- # Create the Lightly client to connect to the API.
- client = ApiWorkflowClient(token="MY_AWESOME_TOKEN")
-
- # Create a new dataset on the Lightly Platform.
- client.create_new_dataset_with_unique_name(
- 'imagenet-example',
- DatasetType.IMAGES,
- )
-
- ## AWS S3
- # Input bucket
- client.set_s3_config(
- resource_path="s3://dataset/imagenet/",
- region='eu-central-1',
- access_key='S3-ACCESS-KEY',
- secret_access_key='S3-SECRET-ACCESS-KEY',
- thumbnail_suffix=".lightly/thumbnails/[filename]_thumb.[extension]",
- purpose=DatasourcePurpose.INPUT
- )
- # Output bucket
- client.set_s3_config(
- resource_path="s3://output/",
- region='eu-central-1',
- access_key='S3-ACCESS-KEY',
- secret_access_key='S3-SECRET-ACCESS-KEY',
- thumbnail_suffix=".lightly/thumbnails/[filename]_thumb.[extension]",
- purpose=DatasourcePurpose.LIGHTLY
- )
-
-
-
-Next, we schedule a job which extracts 500000 images with the CORESET strategy which
-selects a visually diverse set of images:
-
-
-.. code-block:: python
-
- # in this example we use a diversifying selection strategy (CORESET)
-
- client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": False,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": False,
- "pretagging_debug": False
- },
- selection_config = {
- "n_samples": 500000,
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY"
- }
- }
- ]
- }
- )
-
-
-The complete **processing time** was **04h 37m 02s**. The machine used for this experiment is a cloud instance with
-8 cores, 30GB of RAM, and a V100 GPU. The dataset was stored on S3.
-
-You can also use the direct link for the
-`ImageNet `_ report.
-
-
-
-Combining Cityscapes with Kitti
-================================
-
-The Lightly Worker's datapool feature allows updating the pool of selected images
-whenever new data arrives. This is a common use case in production systems where new
-image data arrives every week. In this example we simulate this process by first
-selecting a subset of the Cityscapes dataset and then adding images from Kitti.
-
-
-We start by creating a dataset and configuring the datasource. We assume here that we
-have **only the Cityscapes** dataset stored in our S3 bucket under `s3://dataset/kittiscapes`:
-
-.. code-block:: python
-
- from lightly.api import ApiWorkflowClient
- from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
- from lightly.openapi_generated.swagger_client.models.datasource_purpose import DatasourcePurpose
-
- # Create the Lightly client to connect to the API.
- client = ApiWorkflowClient(token="MY_AWESOME_TOKEN")
-
- # Create a new dataset on the Lightly Platform.
- client.create_new_dataset_with_unique_name(
- 'kittiscapes-example',
- DatasetType.IMAGES,
- )
-
- ## AWS S3
- # Input bucket
- client.set_s3_config(
- resource_path="s3://dataset/kittiscapes/",
- region='eu-central-1',
- access_key='S3-ACCESS-KEY',
- secret_access_key='S3-SECRET-ACCESS-KEY',
- purpose=DatasourcePurpose.INPUT
- )
- # Output bucket
- client.set_s3_config(
- resource_path="s3://output/",
- region='eu-central-1',
- access_key='S3-ACCESS-KEY',
- secret_access_key='S3-SECRET-ACCESS-KEY',
- purpose=DatasourcePurpose.LIGHTLY
- )
-
-The following command schedules a job to select a subset from Cityscapes:
-
-.. code-block:: python
-
- # in this example we use a diversifying selection strategy (CORESET)
-
- client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": False,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": False,
- "pretagging_debug": False
- },
- selection_config = {
- "proportion_samples": 1.0,
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY",
- "stopping_condition_minimum_distance": 0.2
- }
- }
- ]
- }
- )
-
-
-The report for running the command can be found here:
-:download:`Cityscapes.pdf <../resources/datapool_example_cityscapes.pdf>`
-
-Since the Cityscapes dataset has subfolders for the different cities, the Lightly
-Worker uses them as weak labels for the embedding plot, as shown below.
-
-.. figure:: ../resources/cityscapes_scatter_umap_k_15_no_overlay.png
- :align: center
- :alt: some alt text
-
- Scatterplot of Cityscapes. Each color represents one of the 18
- subfolders (cities) of the Cityscapes dataset.
-
-
-Now we can use the datapool to select the interesting
-frames from Kitti and add them to Cityscapes. For this, first **add all images
-from Kitti to the S3 bucket** and then simply run the same command as above again.
-The Lightly Worker will detect which images have already been processed and only work with
-the new images.
-
-
-.. code-block:: python
-
- client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": False,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": False,
- "pretagging_debug": False,
- },
- selection_config = {
- "proportion_samples": 1.0,
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY",
- "stopping_condition_minimum_distance": 0.2
- }
- }
- ]
- }
- )
-
-
-The dataset from the beginning will now contain images from both datasets and
-new plots have been generated in the report. The plots show
-the embeddings and highlight with blue color the samples which have been added
-from the new dataset. In our experiment, we see that Lightly Worker added several
-new samples outside of the previous embedding distribution. This is great, since it
-shows that Cityscapes and Kitti have different data and we can combine the two datasets.
-
-
-.. figure:: ../resources/datapool_umap_scatter_before_threshold_0.2.png
- :align: center
- :alt: An example of the newly selected examples when we use
- stopping_condition.min_distance=0.2
-
- An example of the newly selected examples when we use
- stopping_condition.min_distance=0.2. 7089 samples from Kitti have been added
- to our existing datapool.
-
-.. figure:: ../resources/datapool_umap_scatter_before_threshold_0.05.png
- :align: center
- :alt: An example of the newly selected examples when we use
- stopping_condition.min_distance=0.05
-
- An example of the newly selected examples when we use
- stopping_condition.min_distance=0.05. 3598 samples from Kitti have been added
- to our existing datapool.
-
-
-The report for running the command can be found here:
-:download:`kitti_with_min_distance=0.2.pdf <../resources/datapool_example_kitti_threshold_0.2.pdf>`
-
-And the report for a stopping condition minimum distance of 0.05:
-:download:`kitti_with_min_distance=0.05.pdf <../resources/datapool_example_kitti_threshold_0.05.pdf>`
\ No newline at end of file
diff --git a/docs/source/docker/examples/datasets_in_the_wild.rst b/docs/source/docker/examples/datasets_in_the_wild.rst
deleted file mode 100644
index 90d2b1c3a..000000000
--- a/docs/source/docker/examples/datasets_in_the_wild.rst
+++ /dev/null
@@ -1,233 +0,0 @@
-Extract Diverse Video Frames
-=============================
-
-The following example is a showcase how the Lightly Worker can be used
-to extract frames from a video based on their uniqueness rather than based on timestamps.
-
-.. note:: For all examples we assume that the Lightly Worker is configured and running. See :ref:`docker-setup` for more information.
-
-
-Using ffmpeg
-------------
-
-Using tools such as ffmpeg we can extract frames from a video
-using a simple one-liner like this:
-
-.. code-block:: console
-
- # extract all frames from video.mp4 as .png files and store in frames/ folder
- ffmpeg -i video.mp4 frames/%d.png
-
-ffmpeg allows us to use various flags to choose framerate, crop the images,
-resize the images or set the quality as shown here:
-
-.. code-block:: console
-
- # set framerate to 5 fps
- ffmpeg -i video.mp4 -filter:v "fps=5" frames/%d.png
-
- # resize image to 256x256 pixels
- ffmpeg -i video.mp4 -s 256x256 frames/%d.png
-
- # extract frames as .jpg files
- # high quality jpg compression
- ffmpeg -i video.mp4 -qscale:v 1 frames/%d.jpg
-
- # lower quality jpg compression
- ffmpeg -i video.mp4 -qscale:v 5 frames/%d.jpg
-
- # crop a 480x480 image with 80 pixels offset in x direction
- ffmpeg -i video.mp4 -filter:v "crop=480:480:80:0" frames/%d.png
-
- # and many more ...
-
-However, the problem is that the extracted frames add up and use lots of storage.
-For most training tasks, we don't even want to extract all the frames. Limiting
-the framerate is very easy and helps us reduce the amount of extracted data.
-On the other hand, even a video with 5 fps might contain lots of similar frames
-or even worse, we might miss some frames with lots of "action".
-
-Using the Lightly Worker
-------------------------
-
-The Lightly Worker has been designed to give engineers an alternative to using
-fixed framerates for frame extraction.
-
-How about selecting frames based on their similarity?
-
-In this example, we use the following video: https://www.pexels.com/de-de/video/3719157/
-
-We store the video in a storage bucket, e.g. under *s3://dataset/video/*. We can use wget in
-a terminal under Linux or macOS to download the video and then either upload it via drag and drop
-or with the `aws cli `_.
-
-
-Now, let's extract 99 frames using the Lightly Worker. We start by creating a dataset and configuring the S3 bucket as
-a datasource. We call the dataset `frame-extraction-example` and use the input type `VIDEOS`. We configure the datasource to point at `s3://dataset/video/`.
-
-.. code-block:: python
-
- from lightly.api import ApiWorkflowClient
- from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
- from lightly.openapi_generated.swagger_client.models.datasource_purpose import DatasourcePurpose
-
- # Create the Lightly client to connect to the API.
- client = ApiWorkflowClient(token="MY_AWESOME_TOKEN")
-
- # Create a new dataset on the Lightly Platform.
- client.create_new_dataset_with_unique_name(
- 'frame-extraction-example',
- DatasetType.VIDEOS,
- )
-
- ## AWS S3
- # Input bucket
- client.set_s3_config(
- resource_path="s3://dataset/video/",
- region='eu-central-1',
- access_key='S3-ACCESS-KEY',
- secret_access_key='S3-SECRET-ACCESS-KEY',
- purpose=DatasourcePurpose.INPUT
- )
- # Output bucket
- client.set_s3_config(
- resource_path="s3://output/",
- region='eu-central-1',
- access_key='S3-ACCESS-KEY',
- secret_access_key='S3-SECRET-ACCESS-KEY',
- purpose=DatasourcePurpose.LIGHTLY
- )
-
-
-Next, we schedule a job which extracts 99 frames with a strategy to
-select a diverse set of frames:
-
-
-.. code-block:: python
-
- client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True
- },
- selection_config = {
- "n_samples": 99,
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY"
- }
- }
- ]
- }
- )
-
-The extracted frames can now be found in the output bucket (`s3://output`) and can easily be accessed from the `Lightly Platform `_.
-
-
-For comparison, we extracted frames from the video using ffmpeg with the following command:
-
-.. code-block:: console
-
- ffmpeg -i raw/video.mp4 -filter:v "fps=5" frames_ffmpeg/%d.png
-
-
-The table below shows a comparison of the different extraction methods:
-
-.. list-table::
- :widths: 50 50 50 50 50
- :header-rows: 1
-
- * - Metric
- - original dataset
- - after ffmpeg
- - after random
- - after coreset
- * - Number of Samples
- - 475
- - 99
- - 99
- - 99
- * - L2 Distance (Mean)
- - 1.2620
- - 1.2793
- - 1.2746
- - 1.3711
- * - L2 Distance (Min)
- - 0.0000
- - 0.0000
- - 0.0586
- - 0.2353
- * - L2 Distance (Max)
- - 1.9835
- - 1.9693
- - 1.9704
- - 1.9470
- * - L2 Distance (10th Percentile)
- - 0.5851
- - 0.5891
- - 0.5994
- - 0.8690
- * - L2 Distance (90th Percentile)
- - 1.8490
- - 1.8526
- - 1.8525
- - 1.7822
-
-
-We notice the following when looking at this table:
-
-- The **min distance** between two samples was 0 after ffmpeg selection whereas the
- min distance significantly increased using the coreset selection strategy.
-
- - 0 distance means that there are at least two samples completely identical
- (e.g. two frames in the video are the same)
-
-- The **mean distance** between the original dataset, ffmpeg, as well as
- random selection, is very similar. The coreset selection however differs
- significantly with a higher mean (higher diversity) in the selected dataset.
-
-- The **10th percentile** shows similar behavior to the mean distance.
-
-As you can see in this example, just selecting every N-th frame is similar to
-selecting frames randomly. More sophisticated selection strategies, such as the coreset selection strategy, result in
-much higher sample diversity. The Lightly Worker has been optimized for these selection strategies.
-
-
-.. note:: Note that by default the embeddings of the dataset will be normalized
- to unit vector length. Max L2 distance between two vectors is
- therefore 2.0 (two vectors pointing in opposite directions).
-
-
-Now let's take a look at the storage requirements. If we extracted all frames from the video
-and then ran a selection algorithm on them, we would need 553.4 MBytes. However, the Lightly Worker
-can process the video directly, so we require only 6.4 MBytes of storage. This means it requires 70x less storage!
-
-
-.. list-table::
- :widths: 50 50 50 30
- :header-rows: 1
-
- * - Metric
- - ffmpeg extracted frames
- - Lightly Worker using video
- - Reduction
- * - Storage Consumption
- - 447 MBytes + 6.4 MBytes
- - 6.4 MBytes
- - 70.84x
-
-.. note:: Why not extract the frames as compressed .jpg images? Extracting the
- frames as .jpg would indeed reduce storage consumption. The video from
- our example would end up using (14 MBytes + 6.4 MBytes). However, for
- critical applications where robustness and accuracy of the model are
- key, we have to think about the final system in production. Is your
- production system working with the raw camera stream (uncompressed) or
- with compressed frames (e.g. .jpg)? Very often we don’t have time to
- compress a frame in real-time systems or don’t want to introduce
- compression artifacts. You should also think about whether you want
- to train a model on compressed data when in production it runs
- on raw data.
diff --git a/docs/source/docker/examples/overview.rst b/docs/source/docker/examples/overview.rst
deleted file mode 100644
index 754408481..000000000
--- a/docs/source/docker/examples/overview.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-Examples
-========
-
-We provide examples of how the Lightly Worker can be used on various academic
-datasets and datasets in the wild.
-
-.. toctree::
- :maxdepth: 1
-
- datasets_in_the_wild.rst
- academic_datasets.rst
\ No newline at end of file
diff --git a/docs/source/docker/getting_started/first_steps.rst b/docs/source/docker/getting_started/first_steps.rst
deleted file mode 100644
index 807f3897f..000000000
--- a/docs/source/docker/getting_started/first_steps.rst
+++ /dev/null
@@ -1,947 +0,0 @@
-.. _docker-first-steps:
-
-First Steps
-===================================
-
-The Lightly Worker follows a train, embed, select workflow:
-
-.. code-block:: console
-
- +--------+      +---------+      +--------+
- | Train  +----->+  Embed  +----->+ Select |
- +--------+      +---------+      +--------+
-
-#. You can either use a pre-trained model from the model zoo or fine-tune
- a model on your unlabeled dataset using self-supervised learning. The output
- of the train step is a model checkpoint.
-
-#. The embed step creates embeddings of the input dataset. Each sample gets
- represented using a low-dimensional vector. The output of the embed step is
- a .csv file.
-
-#. Finally, based on the embeddings and additional information, such as predictions or
- metadata, we can use one of the selection strategies to pick the relevant data for you.
- The output of the selection is a list of selected input samples as well as analytics in
- the form of a pdf report with plots.
-
-
-The Lightly Worker can be easily triggered from your Python code. There are various parameters you can
-configure and we also expose the full configuration of the Lightly self-supervised learning framework.
-You can use the Lightly Worker to train a self-supervised model instead of using the Lightly\ **SSL** framework.
-
-Using Docker
--------------
-
-We use docker containers to ship the Lightly Worker. Docker allows us to run the
-same worker on various operating systems with different setups.
-
-`To learn more about docker please head over to the official docs! `_
-
-Here, we quickly explain the most important parts of the typical **docker run** command.
-
-.. code-block:: console
-
- docker run --shm-size="1024m" --gpus all --rm -it lightly/worker:latest
-
-- :code:`docker run` this is the command to run/start a container
-- :code:`--shm-size="1024m"` because we do lots of parallel computations we
- want to give our container some extra shared memory for inter-process communication
-- :code:`--gpus all` passes all the host GPUs to the container and makes them accessible
-- :code:`--rm` makes sure we remove the container after running
-- :code:`-it` enables interactive mode (user input gets passed to container).
- This allows you to use `ctrl-c` to stop the container
-- :code:`lightly/worker:latest` is the docker image we want to run
-
-
-Start the Lightly Worker Docker
---------------------------------
-
-Before we jump into the details of how to submit jobs, we need to start the
-Lightly Worker docker container in worker mode (as outlined in :ref:`docker-setup`).
-
-
-**This is how you start your Lightly Worker:**
-
-.. code-block:: console
-
- docker run --shm-size="1024m" --gpus all --rm -it \
- lightly/worker:latest \
- token=MY_AWESOME_TOKEN \
- worker.worker_id=MY_WORKER_ID
-
-.. note:: Learn how to obtain your :ref:`ref-authentication-token`.
-
-Now, let's see how this will look in action!
-
-
-.. _worker-creating-a-dataset:
-
-Creating a Dataset
-------------------
-
-To set up inputs and outputs for your job you will need a `dataset_id`. You can either create
-a new dataset from Python or re-use an existing one (see :ref:`datapool`).
-
-
-.. code-block:: python
- :caption: Creating a new dataset from Python
-
- from lightly.api import ApiWorkflowClient
- from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
-
- # Create the Lightly client to connect to the API.
- client = ApiWorkflowClient(token="MY_AWESOME_TOKEN")
-
- # Create a new dataset on the Lightly Platform.
- client.create_dataset(
- 'dataset-name',
- DatasetType.IMAGES # can be DatasetType.VIDEOS when working with videos
- )
- dataset_id = client.dataset_id
-
-
-You can see the dataset under https://app.lightly.ai/datasets
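-
-If a dataset with that name already exists, you can also re-use it instead of creating
-a new one. A minimal sketch, assuming the `set_dataset_id_by_name` helper is available
-in your version of the `lightly` package:
-
-.. code-block:: python
-    :caption: Re-using an existing dataset from Python
-
-    from lightly.api import ApiWorkflowClient
-
-    # Create the Lightly client to connect to the API.
-    client = ApiWorkflowClient(token="MY_AWESOME_TOKEN")
-
-    # Point the client at an existing dataset by its name.
-    client.set_dataset_id_by_name("dataset-name")
-    print(client.dataset_id)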
-
-.. _worker-setting-datasource-configs:
-
-Setting the datasource configs
-------------------------------
-
-The Lightly worker reads input data from a cloud storage folder and will upload selection results
-to cloud storage as well. You therefore need to define an `INPUT` and `LIGHTLY` bucket.
-You can re-use the `client` from the previous step. If you create a new `ApiWorkflowClient`
-make sure to specify the `dataset_id` in the constructor.
-
-INPUT bucket
-^^^^^^^^^^^^
-
-The `INPUT` bucket is where the Lightly Worker reads your input data from. You must specify it and you must provide Lightly `LIST` and `READ` access to it.
-
-LIGHTLY bucket
-^^^^^^^^^^^^^^
-
-The `LIGHTLY` bucket must be specified as well and you must provide Lightly `LIST`, `READ` and `WRITE` access to it.
-You can have separate credentials for it or use the same as for the `INPUT` bucket.
-The `LIGHTLY` bucket can point to a different directory in the same bucket or a different bucket (even located at a different cloud storage provider).
-Its `resource_path` must point to an existing directory; the directory may be empty.
-The `LIGHTLY` bucket is used for many purposes:
-
-- Saving thumbnails of images for a more responsive Lightly Platform.
-- Saving images of cropped out objects, if you use the object-level workflow. See also :ref:`docker-object-level`.
-- Saving frames of videos, if your input consists of videos.
-- Providing the relevant filenames file if you want to run the Lightly Worker only on a subset of input files. See also :ref:`specifying_relevant_files`.
-- Providing predictions for running the object level workflow or as additional information for the selection process. See also :ref:`docker-datasource-predictions`.
-- Providing metadata as additional information for the selection process. See also :ref:`docker-datasource-metadata`.
-
-
-.. tabs::
-
- .. tab:: S3
-
- .. code-block:: python
- :caption: Giving access to storage buckets from Python
-
- from lightly.openapi_generated.swagger_client.models.datasource_purpose import DatasourcePurpose
-
- ## AWS S3
- # Input bucket
- client.set_s3_config(
- resource_path="s3://bucket/input/",
- region='eu-central-1',
- access_key='S3-ACCESS-KEY',
- secret_access_key='S3-SECRET-ACCESS-KEY',
- purpose=DatasourcePurpose.INPUT
- )
- # Lightly bucket
- client.set_s3_config(
- resource_path="s3://bucket/lightly/",
- region='eu-central-1',
- access_key='S3-ACCESS-KEY',
- secret_access_key='S3-SECRET-ACCESS-KEY',
- purpose=DatasourcePurpose.LIGHTLY
- )
-
- .. tab:: S3 Delegated Access
-
- .. code-block:: python
- :caption: Giving access to storage buckets from Python
-
- from lightly.openapi_generated.swagger_client.models.datasource_purpose import DatasourcePurpose
-
- ## AWS S3
- # Input bucket
- client.set_s3_delegated_access_config(
- resource_path="s3://bucket/input/",
- region='eu-central-1',
- role_arn='S3-ROLE-ARN',
- external_id='S3-EXTERNAL-ID',
- purpose=DatasourcePurpose.INPUT
- )
- # Lightly bucket
- client.set_s3_delegated_access_config(
- resource_path="s3://bucket/lightly/",
- region='eu-central-1',
- role_arn='S3-ROLE-ARN',
- external_id='S3-EXTERNAL-ID',
- purpose=DatasourcePurpose.LIGHTLY
- )
-
- .. tab:: GCS
-
- .. code-block:: python
- :caption: Giving access to storage buckets from Python
-
- import json
- from lightly.openapi_generated.swagger_client.models.datasource_purpose import DatasourcePurpose
-
- ## Google Cloud Storage
- # Input bucket
- client.set_gcs_config(
- resource_path="gs://bucket/input/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open('credentials_read.json'))),
- purpose=DatasourcePurpose.INPUT
- )
- # Lightly bucket
- client.set_gcs_config(
- resource_path="gs://bucket/lightly/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open('credentials_write.json'))),
- purpose=DatasourcePurpose.LIGHTLY
- )
-
-
- .. tab:: Azure
-
- .. code-block:: python
- :caption: Giving access to storage buckets from Python
-
- from lightly.openapi_generated.swagger_client.models.datasource_purpose import DatasourcePurpose
-
- ## Azure
- # Input bucket
- client.set_azure_config(
- container_name='my-container/input/',
- account_name='ACCOUNT-NAME',
- sas_token='SAS-TOKEN',
- purpose=DatasourcePurpose.INPUT
- )
- # Lightly bucket
- client.set_azure_config(
- container_name='my-container/lightly/',
- account_name='ACCOUNT-NAME',
- sas_token='SAS-TOKEN',
- purpose=DatasourcePurpose.LIGHTLY
- )
-
-.. warning::
- The credentials passed above need to provide Lightly with `LIST` and `READ` access to the `INPUT` bucket and
- with `LIST`, `READ`, and `WRITE` access to the `LIGHTLY` bucket. See :ref:`dataset-creation-gcloud-bucket`,
- :ref:`dataset-creation-aws-bucket`, and :ref:`dataset-creation-azure-storage` for help
- with configuring the different roles.
-
-
-
-.. _worker-scheduling-a-job:
-
-Scheduling a Simple Job
------------------------
-
-Now that everything is in place, let's configure and run a simple job.
-
-.. code-block:: python
- :caption: Scheduling a job from Python
-
- scheduled_run_id = client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- },
- selection_config={
- "n_samples": 50,
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY"
- }
- }
- ]
- }
- )
-
-
-The command schedules a job with the following configurations:
-
-- :code:`enable_corruptness_check` Checks your dataset for corrupt images if **True**.
-
-- :code:`remove_exact_duplicates` Removes exact duplicates if **True**.
-
-- The :code:`selection_config` will make the Lightly Worker choose 50 samples
- from the initial dataset that are as diverse as possible. This is done using the
- embeddings which are automatically created during the run.
-
-For more details and options regarding the worker config, head to :ref:`docker-configuration`.
-For more details and options regarding the selection config, head to :ref:`worker-selection`.
-
-Monitoring the Compute Worker Run
----------------------------------
-
-The worker should pick up the job after a few seconds and start working on it. The
-status of the current run and scheduled jobs can be seen under https://app.lightly.ai/compute/runs.
-Alternatively, you can also monitor it from Python.
-
-.. code-block:: python
- :caption: Monitoring the compute worker run from Python
-
- """
- You can use this code to track and print the state of the compute worker.
- The loop will end once the compute worker run has finished, was canceled or aborted/failed.
- """
- for run_info in client.compute_worker_run_info_generator(scheduled_run_id=scheduled_run_id):
- print(f"Compute worker run is now in state='{run_info.state}' with message='{run_info.message}'")
-
- if run_info.ended_successfully():
- print("SUCCESS")
- else:
- print("FAILURE")
-
-After the job has been processed, the selected data will be accessible in the configured dataset. The
-report can be accessed from the compute worker runs page mentioned above.
-
-
-.. _training-a-self-supervised-model:
-
-Training a Self-Supervised Model
---------------------------------
-
-Sometimes it may be beneficial to finetune a self-supervised model on your
-dataset before embedding the images. This may be the case when the dataset is
-from a specific domain (e.g. for medical images).
-
-The command below will **train a self-supervised model** for (default: 100)
-epochs on the input images before embedding the images and selecting from them.
-
-
-.. code-block:: python
- :emphasize-lines: 5
- :caption: Scheduling a job with self-supervised training from Python
-
- scheduled_run_id = client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": True
- },
- selection_config={
- "n_samples": 50,
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY"
- }
- }
- ]
- }
- )
-
-You may not always want to train for exactly 100 epochs with the default settings.
-The Lightly Worker is a wrapper around the Lightly\ **SSL** Python package.
-Hence, for training and embedding the user can access and set all the settings
-known from the Lightly\ **SSL** Python package.
-
-Here are some of the most common parameters for the **lightly_config**
-you might want to change:
-
-- :code:`loader.num_workers` specifies the number of background workers for data processing.
- -1 uses the number of available CPU cores.
-- :code:`trainer.max_epochs` determines the number of epochs your SSL model should be trained for.
-
-
-.. code-block:: python
- :emphasize-lines: 24, 35
- :caption: Setting the Lightly\ **SSL** parameters from Python
-
- scheduled_run_id = client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": True,
- },
- selection_config={
- "n_samples": 50,
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY"
- }
- }
- ]
- },
- lightly_config={
- 'loader': {
- 'batch_size': 16,
- 'shuffle': True,
- 'num_workers': -1,
- 'drop_last': True
- },
- 'model': {
- 'name': 'resnet-18',
- 'out_dim': 128,
- 'num_ftrs': 32,
- 'width': 1
- },
- 'trainer': {
- 'gpus': 1,
- 'max_epochs': 100,
- 'precision': 32
- },
- 'criterion': {
- 'temperature': 0.5
- },
- 'optimizer': {
- 'lr': 1,
- 'weight_decay': 0.00001
- },
- 'collate': {
- 'input_size': 64,
- 'cj_prob': 0.8,
- 'cj_bright': 0.7,
- 'cj_contrast': 0.7,
- 'cj_sat': 0.7,
- 'cj_hue': 0.2,
- 'min_scale': 0.15,
- 'random_gray_scale': 0.2,
- 'gaussian_blur': 0.5,
- 'kernel_size': 0.1,
- 'vf_prob': 0,
- 'hf_prob': 0.5,
- 'rr_prob': 0
- }
- }
- )
-
-
-**Checkpoints** from your training process will be stored in the Lightly Platform as artifacts.
-You can use such a checkpoint in future worker runs by downloading the checkpoint to
-a `shared directory` and then passing the checkpoint filename to the container.
-
-
-.. code-block:: python
- :caption: Downloading the checkpoint
-
- # wait until the job has finished
- for run_info in client.compute_worker_run_info_generator(scheduled_run_id=scheduled_run_id):
- pass
-
- # download the checkpoint file
- run = client.get_compute_worker_run_from_scheduled(scheduled_run_id=scheduled_run_id)
- client.download_compute_worker_run_checkpoint(run=run, output_path="{SHARED_DIR}/checkpoint.ckpt")
-
-
-.. code-block:: console
- :emphasize-lines: 2
- :caption: Starting the worker with a `shared directory`
-
- docker run --shm-size="1024m" --gpus all --rm -it \
- -v {SHARED_DIR}:/home/shared_dir \
- lightly/worker:latest \
- token=MY_AWESOME_TOKEN \
- worker.worker_id=MY_WORKER_ID
-
-
-.. code-block:: python
- :caption: Scheduling a job with a pre-trained checkpoint
- :emphasize-lines: 6
-
- client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False, # set to True if you want to continue training
- "checkpoint": "checkpoint.ckpt"
- },
- selection_config={
- "n_samples": 50,
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY"
- }
- }
- ]
- }
- )
-
-
-.. _specifying_relevant_files:
-
-Specifying Relevant Files
--------------------------
-Oftentimes not all files in a bucket are relevant. In that case, it's possible
-to pass a list of filenames to the worker using the `relevant_filenames_file` configuration option.
-It will then only consider the listed filenames and ignore all others. To do so, you can create a text file which
-contains one relevant filename per line and then pass the path to the text file when scheduling the job. This works for videos and images.
-
-.. warning:: The `relevant_filenames_file` is expected to be in the **lightly bucket** as specified above (see :ref:`worker-setting-datasource-configs`) and must always be
- located in a subdirectory called `.lightly`.
-
-For example, let's say you're working with the following file structure in an S3 bucket where
-you are only interested in `image_1.png`, `subdir/image_2.png` and `subdir/image_3.png`
-
-.. code-block:: console
-
- s3://my-input-bucket/
- L image_1.png
- L subdir/
- L image_2.png
- L image_3.png
- L image_40.png
- L image_41.png
- L image_42.png
-
-
-Then you can add a file called `relevant_filenames.txt` to your Lightly bucket with the following content. Note that only file paths relative to the bucket are supported and that they cannot include dot notation (`./` or `../`).
-
-.. code-block:: text
- :caption: relevant_filenames.txt
-
- image_1.png
- subdir/image_2.png
- subdir/image_3.png
-
-
-It's also possible to specify a prefix by denoting it with an asterisk `*` to include whole folders instead of listing many files individually.
-Everything up until the first `*` of a line will be considered as the prefix.
-
-.. code-block:: text
- :emphasize-lines: 2
- :caption: relevant_filenames.txt
-
- image_1.png
- subdir/*
-
-
-You can also combine the power of the prefix with the `gitignore syntax `_ to exclude certain files again.
-
-.. code-block:: text
- :emphasize-lines: 2
- :caption: relevant_filenames.txt
-
- image_1.png
- subdir/* subdir/image_4* !subdir/image_41.png !subdir/image_42.png
- ^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- prefix gitignore patterns separated by a whitespace
-
-
-In the above example `image_1.png`, `subdir/image_2.png`, `subdir/image_3.png`, `subdir/image_41.png`, `subdir/image_42.png` would be considered, while `subdir/image_40.png` would be ignored.
-
-When using this feature, the Lightly bucket should then look like this:
-
-
-.. code-block:: console
-
- s3://my-Lightly-bucket/
- L .lightly/
- L relevant_filenames.txt
-
-
-The corresponding Python command to submit a job would then be as follows:
-
-.. code-block:: python
- :emphasize-lines: 3
- :caption: Scheduling a job with relevant filenames from Python
-
- client.schedule_compute_worker_run(
- worker_config={
- "relevant_filenames_file": ".lightly/relevant_filenames.txt",
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True
- },
- selection_config={
- "n_samples": 50,
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY"
- }
- }
- ]
- }
- )
-
-
-
-Artifacts
----------
-Each scheduled job creates a number of artifacts upon execution. These artifacts
-are uploaded to the Lightly Platform and can be accessed with the help of the Python client.
-
-Here's a list of the artifacts generated by the Lightly Worker:
-
-- :ref:`report`
-- :ref:`log_file`
-- :ref:`memory_log_file`
-- :ref:`checkpoint`
-
-
-The following code shows how to download the artifacts of a Lightly Worker run.
-
-.. code-block:: python
- :caption: Download all artifacts
-
- scheduled_run_id = client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- },
- selection_config={
- "n_samples": 50,
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY"
- }
- }
- ]
- }
- )
-
- # wait until the run is finished
- for run_info in client.compute_worker_run_info_generator(scheduled_run_id=scheduled_run_id):
- pass
-
- # download all artifacts to "my_run/artifacts"
- run = client.get_compute_worker_run_from_scheduled(scheduled_run_id=scheduled_run_id)
- client.download_compute_worker_run_artifacts(run=run, output_dir="my_run/artifacts")
-
-
-It's also possible to get the artifacts by only knowing the dataset id:
-
-.. code-block:: python
- :caption: Download all artifacts by dataset id
-
- # get all runs for a given dataset sorted from old to new
- runs = client.get_compute_worker_runs(dataset_id=client.dataset_id)
- run = runs[-1] # get the latest run
-
- # download all artifacts to "my_run/artifacts"
- client.download_compute_worker_run_artifacts(run=run, output_dir="my_run/artifacts")
-
-
-.. note::
- The above examples show how to download all artifacts at once. It's also possible to
- download each artifact on its own. To see how, please refer to the individual sections
- below.
-
-
-.. _report:
-
-Report
-^^^^^^
-
-
-
-To facilitate sustainability and reproducibility in ML, the Lightly worker
-has an integrated reporting component which provides plots, statistics, and more information collected
-during the various processing steps. For example, there is information about the corruptness check, embedding process and selection process.
-
-To make it easier for you to understand and discuss the dataset we put the essential information into
-an automatically generated PDF report.
-Sample reports can be found on the `Lightly website `_.
-
-The report is also available as a report.json file. Any value from the PDF report can easily be accessed programmatically.
-
-.. code-block:: python
- :caption: Download the report
-
- # download the report as .pdf
- run = client.get_compute_worker_run_from_scheduled(scheduled_run_id=scheduled_run_id)
- client.download_compute_worker_run_report_pdf(run=run, output_path="my_run/artifacts/report.pdf")
-
- # download the report as .json
- run = client.get_compute_worker_run_from_scheduled(scheduled_run_id=scheduled_run_id)
- client.download_compute_worker_run_report_json(run=run, output_path="my_run/artifacts/report.json")
-
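-Once downloaded, the JSON report can be inspected with standard Python. The snippet
-below is just a sketch; the exact keys available depend on your run, so treat the
-accessed fields as illustrative:
-
-.. code-block:: python
-    :caption: Inspecting the report.json
-
-    import json
-
-    with open("my_run/artifacts/report.json") as f:
-        report = json.load(f)
-
-    # list the top-level sections of the report
-    print(sorted(report.keys()))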
-
-**Histograms and Plots**
-
-The report contains histograms of the pairwise distance between images before and after the selection process.
-
-An example of such a histogram before and after filtering for the CamVid dataset consisting of 367
-samples is shown below. We marked the region which is of special interest with an orange rectangle.
-Our goal is to make this histogram more symmetric by removing samples of short distances from each other.
-
-If we remove 25 samples (7%) out of the 367 samples of the CamVid dataset the histogram looks more symmetric
-as shown below. In our experiments, removing 7% of the dataset results in a model with higher validation set accuracy.
-
-.. image:: images/histogram_before_after.jpg
-
-.. note::
-
- Why symmetric histograms are preferred: An asymmetric histogram can be the result of either a dataset with outliers or inliers.
- A heavy tail for low distances means that there is at least one high-density region with many samples very close to each other within the main cluster.
- Having such a high-density region can lead to biased models trained on this particular dataset. A heavy tail towards high distances shows that there is
- at least one high-density region outside the main cluster of samples.
-
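-If you want to reproduce such a histogram yourself, you can compute pairwise distances
-from the `embeddings.csv` artifact. The sketch below assumes the embedding values are
-stored in columns named `embedding_0`, `embedding_1`, and so on; adjust the column
-selection to the actual layout of your file:
-
-.. code-block:: python
-    :caption: Computing a pairwise distance histogram from embeddings
-
-    import numpy as np
-    import pandas as pd
-
-    df = pd.read_csv("embeddings.csv")
-    embeddings = df[[c for c in df.columns if c.startswith("embedding_")]].to_numpy()
-
-    # pairwise L2 distances, keeping only the upper triangle to skip self-distances
-    diffs = embeddings[:, None, :] - embeddings[None, :, :]
-    distances = np.linalg.norm(diffs, axis=-1)
-    upper = distances[np.triu_indices(len(embeddings), k=1)]
-
-    # histogram of the pairwise distances, similar to the plots in the report
-    counts, bin_edges = np.histogram(upper, bins=50)
-    print(counts, bin_edges)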
-
-.. _log_file:
-
-Log File
-^^^^^^^^
-A file containing useful log messages for debugging. In case your job does not get
-processed properly and an error occurred, this file contains more detailed information
-about what went wrong.
-
-.. code-block:: python
- :caption: Download the log file
-
- # download the log file
- run = client.get_compute_worker_run_from_scheduled(scheduled_run_id=scheduled_run_id)
- client.download_compute_worker_run_log(run=run, output_path="my_run/artifacts/log.txt")
-
-
-.. _memory_log_file:
-
-Memory Log File
-^^^^^^^^^^^^^^^
-This file contains relevant information about the memory consumption of the Lightly Worker.
-
-.. code-block:: python
- :caption: Download the memory log file
-
- # download the log file
- run = client.get_compute_worker_run_from_scheduled(scheduled_run_id=scheduled_run_id)
- client.download_compute_worker_run_memory_log(run=run, output_path="my_run/artifacts/memlog.txt")
-
-
-.. _checkpoint:
-
-Checkpoint
-^^^^^^^^^^
-Checkpoint with the trained model weights (exists only if `enable_training=True`).
-See :ref:`load-model-from-checkpoint` on how to use the checkpoint file.
-
-.. note::
- The checkpoint file is only available if the Lightly Worker was run in training mode!
- For details, see :ref:`training-a-self-supervised-model`
-
-.. code-block:: python
- :caption: Download the checkpoint
-
- # download the checkpoint file
- run = client.get_compute_worker_run_from_scheduled(scheduled_run_id=scheduled_run_id)
- client.download_compute_worker_run_checkpoint(run=run, output_path="my_run/artifacts/checkpoint.ckpt")
-
-
-
-Other Outputs
--------------
-
-The Lightly Worker produces a variety of different files which can be used for debugging or further
-processing of the selected images. To access *all* of the generated files, it's necessary to mount
-a local volume to the docker container when starting the Lightly Worker.
-
-Don't forget to also remove the curly brackets :code:`{ }` when replacing
-:code:`{OUTPUT_DIR}` with the path where you want to have the output directory.
-
-
-.. code-block:: console
- :emphasize-lines: 2
- :caption: Starting the worker with an `output directory`
-
- docker run --shm-size="1024m" --gpus all --rm -it \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MY_AWESOME_TOKEN \
- worker.worker_id=MY_WORKER_ID
-
-
-.. warning:: Docker volume or port mappings always follow the scheme that you first
- specify the host system's path (or port) followed by the path (or port) inside the
- container. E.g. **-v /outputs:/home/outputs** would mount /outputs
- from your system to /home/outputs in the docker container.
-
-
-The output directory is structured in the following way:
-
-* config:
- A directory containing copies of the configuration files and overwrites.
-* data:
- The data directory contains everything to do with data.
- * `embeddings.csv` contains the computed embeddings for all input samples used in selection (including datapool samples, but excluding corrupt or duplicate samples).
- * `selected_embeddings_including_datapool.csv` contains the embeddings of all selected samples (including preselected datapool samples).
- * If `enable_corruptness_check=True`, `data` will contain a filtered dataset. It will only contain samples whose files exist, are accessible, are of the type specified by their extension and don't have any artefacts.
- * If `selected_sequence_length > 1`, `data` will contain a `sequence_information.json`
- file with information about the selected sequences (filenames, video frame timestamps, ...).
- Head to :ref:`sequence-selection` for more details on sequence selection.
-* log.txt
- A file containing useful log messages for debugging. In case your job does not get
- processed properly and an error occurred, this file contains more detailed information
- about what went wrong.
-* filenames:
- This directory contains lists of filenames of the corrupt images, removed images, selected
- images and the images which were removed because they have an exact duplicate in the dataset.
-* lightly_epoch_X.ckpt
- Checkpoint with the trained model weights (exists only if `enable_training=True`).
- See :ref:`load-model-from-checkpoint` on how to use the checkpoint file.
-* plots:
- A directory containing the plots which were produced for the report.
-* report.pdf
- To provide a simple overview of the filtering process the Lightly worker automatically generates a report.
- The report contains
- * information about the job (duration, processed files etc.)
- * estimated savings in terms of labeling costs and CO2 due to the smaller dataset
- * statistics about the dataset before and after the selection process
- * histogram before and after filtering
- * visualizations of the dataset
- * nearest neighbors of retained images among the removed ones
-* report.json
- The report is also available as a report.json file. Any value from the PDF report can easily be accessed.
-
-
-Below you find a typical output folder structure.
-
-
-.. code-block:: console
-
- |-- config
- | |-- config.yaml
- | |-- hydra.yaml
- | '-- overrides.yaml
- |-- data
- | |-- al_score_embeddings.csv
- | |-- bounding_boxes.json
- | |-- bounding_boxes_examples
- | |-- embeddings.csv
- | |-- normalized_embeddings.csv
- | |-- sampled
- | |-- selected_embeddings.csv
- | '-- sequence_information.json
- |-- filenames
- | |-- corrupt_filenames.txt
- | |-- duplicate_filenames.txt
- | |-- removed_filenames.txt
- | '-- sampled_filenames_excluding_datapool.txt
- |-- lightly_epoch_X.ckpt
- |-- plots
- | |-- distance_distr_after.png
- | |-- distance_distr_before.png
- | |-- filter_decision_0.png
- | |-- filter_decision_11.png
- | |-- filter_decision_22.png
- | |-- filter_decision_33.png
- | |-- filter_decision_44.png
- | |-- filter_decision_55.png
- | |-- pretagging_histogram_after.png
- | |-- pretagging_histogram_before.png
- | |-- scatter_pca.png
- | |-- scatter_pca_no_overlay.png
- | |-- scatter_umap_k_15.png
- | |-- scatter_umap_k_15_no_overlay.png
- | |-- scatter_umap_k_5.png
- | |-- scatter_umap_k_50.png
- | |-- scatter_umap_k_50_no_overlay.png
- | '-- scatter_umap_k_5_no_overlay.png
- |-- report.json
- '-- report.pdf
-
-
-
-Downloading
------------
-
-After a job has successfully run, a dataset with the selected samples
-and a tag with the name `initial-tag` are created. From there you can easily
-export and download the filenames for further processing:
-
-.. code-block:: python
- :caption: Download the filenames for further processing
-
- from lightly.api.api_workflow_client import ApiWorkflowClient
-
- client = ApiWorkflowClient(token='MY_AWESOME_TOKEN', dataset_id='xyz') # replace with your token and dataset id
- filenames = client.export_filenames_by_tag_name(
- 'initial-tag' # name of the datasets tag
- )
- with open('filenames-of-initial-tag.txt', 'w') as f:
- f.write(filenames)
-
-
-We also support multiple `additional export formats `_
-with which you can e.g. export to Label Studio or Label Box.
-
-It is also possible to directly download the actual files themselves as follows:
-
-.. code-block:: python
- :caption: Directly download the files
-
- from lightly.api.api_workflow_client import ApiWorkflowClient
-
- client = ApiWorkflowClient(token='MY_AWESOME_TOKEN', dataset_id='xyz') # replace with your token and dataset id
- client.download_dataset(
- './my/output/path/', # path to where the files should be saved
- 'initial-tag' # name of the datasets tag
- )
-
-
-Sharing Datasets
-----------------
-
-Once a dataset has been created we can also make it accessible to other users by
-sharing it. Sharing works through e-mail addresses.
-
-.. code-block:: python
- :caption: Share a dataset
-
- # we first need to have an api client (create a new or use an existing one)
- client = ApiWorkflowClient(token="MY_AWESOME_TOKEN")
-
- # share a dataset with a user
- client.share_dataset_only_with(dataset_id="MY_DATASET_ID", user_emails=["user@something.com"])
-
- # share the dataset with an additional user while keeping it shared with previous users
- user_emails = client.get_shared_users(dataset_id="MY_DATASET_ID")
- user_emails.append("additional_user2@something.com")
- client.share_dataset_only_with(dataset_id="MY_DATASET_ID", user_emails=user_emails)
-
- # revoke access to all users
- client.share_dataset_only_with(dataset_id="MY_DATASET_ID", user_emails=[])
-
-
-If you want to get a list of users that have access to a given dataset, you can do
-so using the following code:
-
-.. code-block:: python
- :caption: Get the users a dataset is shared with
-
- # we first need to have an api client (create a new or use an existing one)
- client = ApiWorkflowClient(token="MY_AWESOME_TOKEN")
-
- # get a list of users that have access to a given dataset
- users = client.get_shared_users(dataset_id="MY_DATASET_ID")
- print(users)
- # ["user@something.com"]
-
-
-.. note::
-
- You can share a dataset immediately after creating the dataset. You don't have
- to wait for a Lightly Worker run to complete!
diff --git a/docs/source/docker/getting_started/hardware_recommendations.rst b/docs/source/docker/getting_started/hardware_recommendations.rst
deleted file mode 100644
index a5d58b05b..000000000
--- a/docs/source/docker/getting_started/hardware_recommendations.rst
+++ /dev/null
@@ -1,88 +0,0 @@
-.. _hardware-recommendations:
-
-Hardware recommendations
-========================
-
-The Lightly Worker is usually run on dedicated hardware
-or in the cloud on a compute instance
-which is specifically spun up to run Lightly Worker standalone.
-Our recommendations on the hardware requirements of this compute instance are
-based on three criteria:
-
-- speed: The worker should process your dataset as quickly as possible.
-- cost-effectiveness: The compute instance should be economical.
-- stability: The worker should not crash because it runs out of memory.
-
-Depending on your dataset size, we recommend the following machine:
-
-- Up to 100,000 images or video frames: Use the AWS EC2 instance `g4dn.xlarge` or similar
- with 4 vCPUs, 16GB of system memory, one T4 GPU
-- Up to 1 million images or video frames: Use the AWS EC2 instance `g4dn.2xlarge` or similar
- with 8 vCPUs, 32GB of system memory, one T4 GPU
-- More than 1 million images or video frames: Use the AWS EC2 instance `g4dn.4xlarge` or similar
- with 16 vCPUs, 64GB of system memory, one T4 GPU
-
-You can compute the number of frames of your videos from their length and fps.
-E.g. 100 videos of 600s length each at 30 fps have 100 * 600 * 30 = 1.8 million frames.
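-
-The same back-of-the-envelope calculation in Python, using the numbers from the example above:
-
-.. code-block:: python
-
-    n_videos = 100
-    video_length_s = 600
-    fps = 30
-
-    total_frames = n_videos * video_length_s * fps
-    print(total_frames)  # 1800000, i.e. 1.8 million frames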
-
-If you want to train an embedding model for many epochs or want to further increase computing speed,
-we recommend switching to a V100 or A10 GPU or better.
-
-If you stream the data from a cloud bucket using the datasource feature, make sure that
-the cloud bucket is in the same region as the compute machine.
-Using the same region is very important, see also :ref:`ref-docker-network-traffic-same-region`.
-If you are using the old workflow of reading from a local disk instead, use an SSD.
-However, we recommend streaming from a cloud bucket.
-
-
-Keep the configuration option `lightly.loader.num_workers` at the default (-1),
-which will set it to the number of vCPUs on your machine.
-
-Finding the compute speed bottleneck
-------------------------------------
-
-Usually, the compute speed is limited by one of three potential bottlenecks.
-Different steps of the Lightly Worker use these resources to a different extent.
-Thus the bottleneck changes throughout the run. The bottlenecks are:
-
-- data read speed: I/O
-- CPU
-- GPU
-
-
-The GPU is used during three steps:
-
-- training an embedding model (optional step)
-- pretagging your dataset (optional step)
-- embedding your dataset
-
-The I/O and CPUs are used during the previous 3 steps and also used during the other steps that may take longer:
-
-- initializing the dataset
-- corruptness check (optional step)
-- dataset dumping & upload (optional step)
-
-Before changing the hardware configuration of your compute instance,
-we recommend first determining the bottleneck by monitoring it:
-
-- You can find out the current disk I/O of your machine using the terminal command `iotop`.
-- If you use a datasource, check the current network usage using the terminal command `ifstat`.
-- You can find out the current CPU and RAM usage of your machine using the terminal commands `top` or `htop`.
-- You can find out the current GPU usage (both compute and VRAM) using the terminal command `watch nvidia-smi`.
-- Note that you might need to install these commands using your package manager.
-
-
-In addition to using these tools, you can also compare the relative duration of the different steps to find the bottleneck.
-E.g. if the embedding step takes much longer than the corruptness check, then the GPU is the bottleneck.
-Otherwise, it is the I/O or CPU.
-
-Updating the machine
---------------------
-
-When updating the machine, we recommend updating the resource that causes the
-bottleneck. After that, the bottleneck might have changed.
-
-If there is not one obvious bottleneck, we recommend scaling up I/O, CPUs and GPU together.
-
-To prevent the worker from running out of system memory or GPU memory, we recommend
-about 4GB of RAM and 2GB of VRAM for each vCPU.
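-
-As a rough sizing sketch based on this rule of thumb (the numbers mirror the
-recommendations above and are not hard requirements):
-
-.. code-block:: python
-
-    n_vcpus = 8  # e.g. a g4dn.2xlarge
-
-    recommended_ram_gb = 4 * n_vcpus   # about 4GB of RAM per vCPU
-    recommended_vram_gb = 2 * n_vcpus  # about 2GB of VRAM per vCPU
-    print(recommended_ram_gb, recommended_vram_gb)  # 32 16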
diff --git a/docs/source/docker/getting_started/images/colab_embeddings_example.png b/docs/source/docker/getting_started/images/colab_embeddings_example.png
deleted file mode 100644
index 8c1c14e74..000000000
Binary files a/docs/source/docker/getting_started/images/colab_embeddings_example.png and /dev/null differ
diff --git a/docs/source/docker/getting_started/images/docker_runs_overview.png b/docs/source/docker/getting_started/images/docker_runs_overview.png
deleted file mode 100644
index 5b678596d..000000000
Binary files a/docs/source/docker/getting_started/images/docker_runs_overview.png and /dev/null differ
diff --git a/docs/source/docker/getting_started/images/docker_workers_overview_empty.png b/docs/source/docker/getting_started/images/docker_workers_overview_empty.png
deleted file mode 100644
index 74ac820eb..000000000
Binary files a/docs/source/docker/getting_started/images/docker_workers_overview_empty.png and /dev/null differ
diff --git a/docs/source/docker/getting_started/images/docker_workers_overview_registered.png b/docs/source/docker/getting_started/images/docker_workers_overview_registered.png
deleted file mode 100644
index f7481720e..000000000
Binary files a/docs/source/docker/getting_started/images/docker_workers_overview_registered.png and /dev/null differ
diff --git a/docs/source/docker/getting_started/images/histogram_before_after.jpg b/docs/source/docker/getting_started/images/histogram_before_after.jpg
deleted file mode 100644
index 4083dff22..000000000
Binary files a/docs/source/docker/getting_started/images/histogram_before_after.jpg and /dev/null differ
diff --git a/docs/source/docker/getting_started/selection.rst b/docs/source/docker/getting_started/selection.rst
deleted file mode 100644
index 12bc742ff..000000000
--- a/docs/source/docker/getting_started/selection.rst
+++ /dev/null
@@ -1,569 +0,0 @@
-.. _worker-selection:
-
-Selection
-=========
-
-The Lightly Worker allows you to specify the subset to be selected based on several objectives.
-
-E.g. you can specify that the images in the subset should be visually diverse, should be images the model struggles with (active learning),
-should only be sharp images, or should follow a certain class distribution, e.g. 50% sunny, 30% cloudy and 20% rainy weather.
-
-Each of these objectives is defined by a `strategy`. A strategy consists of two parts:
-
-- The :code:`input` defines which data the objective is defined on. This data is either a scalar number or a vector for each sample in the dataset.
-- The :code:`strategy` itself defines the objective to apply on the input data.
-
-The Lightly Worker allows you to specify several objectives at the same time. The algorithms try to fulfil all objectives simultaneously.
-
-Lightly Worker's data selection algorithms support four types of input:
-
-- **Embeddings** computed using `our open source framework for self-supervised learning `_
-- **Lightly metadata** is metadata, such as image sharpness, computed from the images themselves by the Lightly Worker.
-- (Optional) :ref:`Model predictions ` such as classifications, object detections or segmentations
-- (Optional) :ref:`Custom metadata ` can be anything you can encode in a json file (from numbers to categorical strings)
-
-Prerequisites
--------------
-
-In order to use the selection feature, you need to
-
-- Start the Lightly Worker in worker mode. See :ref:`worker-register`.
-
-- Set up a dataset in the Lightly Platform with a cloud storage as datasource. See :ref:`worker-creating-a-dataset`
-
-Scheduling a Lightly Worker run with selection
-----------------------------------------------
-
-For scheduling a Lightly Worker run with a specific selection,
-you can use the python client and its :py:meth:`schedule_compute_worker_run ` method.
-You specify the selection with the :code:`selection_config` argument.
-See :ref:`worker-scheduling-a-job` for reference.
-
-Here is an example for scheduling a Lightly worker run with a specific selection configuration:
-
-.. literalinclude:: ../integration/examples/trigger_job.py
-
-
-
-Selection Configuration
------------------------
-
-The configuration of a selection needs to specify both the maximum number of samples to select and the strategies:
-
-.. code-block:: python
-
- {
- "n_samples": 50,
- "proportion_samples": 0.1
- "strategies": [
- {
- "input": {
- "type": ...
- },
- "strategy": {
- "type": ...
- }
- },
- ... more strategies
- ]
- }
-
-The variable :code:`n_samples` must be a positive integer specifying the absolute number of samples which should be selected.
-Alternatively to :code:`n_samples`, you can also set :code:`proportion_samples` to set the number of samples to be selected relative to the input dataset size.
-E.g. set it to `0.1` to select 10% of all samples.
-Please set either one or the other. Setting both or none of them will cause an error.
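-
-For example, a minimal configuration that selects 10% of the input dataset by diversity
-could be passed to :code:`schedule_compute_worker_run` as sketched below; the worker
-config is kept minimal here and should be adjusted to your needs:
-
-.. code-block:: python
-
-    scheduled_run_id = client.schedule_compute_worker_run(
-        worker_config={
-            "enable_corruptness_check": True,
-            "remove_exact_duplicates": True,
-        },
-        selection_config={
-            "proportion_samples": 0.1,  # select 10% of the input dataset
-            "strategies": [
-                {
-                    "input": {"type": "EMBEDDINGS"},
-                    "strategy": {"type": "DIVERSITY"},
-                }
-            ],
-        },
-    )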
-
-Each strategy is specified by a :code:`dictionary`, which is always made up of an :code:`input` and the actual :code:`strategy`.
-
-.. code-block:: python
-
- {
- "input": {
- "type": ...
- },
- "strategy": {
- "type": ...
- }
- },
-
-
-Selection Input
-^^^^^^^^^^^^^^^^
-
-The input can be one of the following:
-
-.. tabs::
-
- .. tab:: EMBEDDINGS
-
- The `Lightly OSS framework for self supervised learning `_ is used to compute the embeddings.
- They are a vector of numbers for each sample.
-
- You can define embeddings as input using:
-
- .. code-block:: python
-
- "input": {
- "type": "EMBEDDINGS"
- }
-
- You can also use embeddings from other datasets to create strategies such as
- similarity search:
-
- .. code-block:: python
-
- "input": {
- "type": "EMBEDDINGS",
- "dataset_id": "DATASET_ID_OF_THE_QUERY_IMAGES",
- "tag_name": "TAG_NAME_OF_THE_QUERY_IMAGES" # e.g. "initial-tag"
- },
-
- .. tab:: SCORES
-
- Active learning scores are a scalar number for each sample. They are **specified by the prediction task and the scorer**:
-
- .. code-block:: python
-
- # using your own predictions
- "input": {
- "type": "SCORES",
- "task": "YOUR_TASK_NAME",
- "score": "uncertainty_entropy"
- }
-
- # using the lightly pretagging model
- "input": {
- "type": "SCORES",
- "task": "lightly_pretagging",
- "score": "uncertainty_entropy"
- }
-
- You can specify one of the tasks you specified in your datasource, see :ref:`docker-datasource-predictions` for reference.
- Alternatively, set the task to **lightly_pretagging** to use object detections created by the Lightly Worker itself.
- See :ref:`docker-pretagging` for reference.
-
-
- .. tab:: PREDICTIONS
-
- .. _worker-selection-predictions:
-
- The class distribution probability vector of predictions can be used as well. Here, three cases have to be distinguished:
-
- - **Image Classification**: The probability vector of each sample's prediction is used directly.
-
- - **Object Detection**: The probability vectors of the class predictions of all objects in an image are summed up.
-
- - **Object Detection** and using the :ref:`docker-object-level`: Each sample is a cropped object and has a single object prediction, whose probability vector is used.
-
- This input is **specified using the prediction task**. Keep in mind which class names are used for this task, as they are needed in later steps.
-
- If you use your own predictions (see :ref:`docker-datasource-predictions`), the task name and class names are taken from the specification in the prediction `schema.json`.
-
- Alternatively, set the task to **lightly_pretagging** to use object detections created by the Lightly Worker itself.
- Its class names are specified here: :ref:`docker-pretagging`.
-
-
- .. code-block:: python
-
- # using your own predictions
- "input": {
- "type": "PREDICTIONS",
- "task": "my_object_detection_task",
- "name": "CLASS_DISTRIBUTION"
- }
-
- # using the lightly pretagging model
- "input": {
- "type": "PREDICTIONS",
- "task": "lightly_pretagging",
- "name": "CLASS_DISTRIBUTION"
- }
-
- .. tab:: METADATA
-
- Metadata is specified by the metadata key. It can be divided across two dimensions:
-
- - **Custom Metadata** vs. **Lightly Metadata**
-
- **Custom Metadata** must be specified when creating a datasource and you must have uploaded metadata to it.
- See :ref:`docker-datasource-metadata` for reference. An example configuration:
-
- .. code-block:: python
-
- "input": {
- "type": "METADATA",
- "key": "weather.temperature"
- }
-
- Use the “path” you specified when creating the metadata in the datasource as the key.
-
-
- **Lightly Metadata** is calculated by the Lightly Worker. It is specified by prepending :code:`lightly` to the key.
- An example configuration:
-
- .. code-block:: python
-
- "input": {
- "type": "METADATA",
- "key": "lightly.sharpness"
- }
-
- Currently supported metadata are :code:`sharpness`, :code:`snr` (signal-to-noise-ratio) and :code:`sizeInBytes`.
- If your use case would benefit from more metadata computed from the images, please reach out to us.
-
- - **Numerical** vs. **Categorical** values
-
- Not all metadata types can be used in all selection strategies. The Lightly Worker differentiates between numerical and categorical metadata.
-
- **Numerical** metadata are numbers (int, float), e.g. `lightly.sharpness` or `weather.temperature`. It is usually real-valued.
-
- **Categorical** metadata is from a discrete number of categories, e.g. `video.location_id` or `weather.description`.
- It can be either an integer or a string.
-
-
-Selection Strategy
-^^^^^^^^^^^^^^^^^^^
-
-There are several types of selection strategies, all trying to reach different objectives.
-
-.. tabs::
-
- .. tab:: DIVERSITY
-
- Use this strategy to **select samples such that they are as different as possible from each other**.
-
- Can be used with **EMBEDDINGS**.
- Samples with a high distance between their embeddings are
- considered to be more *different* from each other than samples with a
- low distance. The strategy is specified like this:
-
- .. code-block:: python
-
- "strategy": {
- "type": "DIVERSITY"
- }
-
- If you want to preserve a minimum distance between chosen samples, you
- can specify it as an additional stopping condition. The selection process
- will stop as soon as one of the stopping criteria has been reached.
-
- .. code-block:: python
- :emphasize-lines: 3
-
- "strategy": {
- "type": "DIVERSITY",
- "stopping_condition_minimum_distance": 0.2
- }
-
- Setting :code:`"stopping_condition_minimum_distance": 0.2` will remove all samples which are
- closer to each other than 0.2.
- This allows you to specify the minimum allowed distance between two images in the output dataset.
- If you use embeddings as input, this value should be between 0 and 2.0, as the embeddings are normalized to unit length.
- This is often a convenient method when working with different data sources and trying to combine them in a balanced way.
- If you want to use this stopping condition to stop the selection early,
- make sure that you allow selecting enough samples by setting :code:`n_samples` or :code:`proportion_samples` high enough.
-
- .. note:: Higher minimum distance in the embedding space results in more
- diverse images being selected. Furthermore, increasing the
- minimum distance will result in fewer samples being selected.
-
- .. tab:: WEIGHTS
-
- The objective of this strategy is to **select samples that have a high numerical value**.
-
- Can be used with **SCORES** and **numerical METADATA**. It can be specified with:
-
- .. code-block:: python
-
- "strategy": {
- "type": "WEIGHTS"
- }
-
- .. tab:: THRESHOLD
-
- The objective of this strategy is to only **select samples that have a numerical value fulfilling a threshold criterion**.
- E.g. they should be bigger or smaller than a certain value.
-
- Can be used with **SCORES** and **numerical METADATA**. It is specified as follows:
-
- .. code-block:: python
-
- "strategy": {
- "type": "THRESHOLD",
- "threshold": 20,
- "operation": "BIGGER_EQUAL"
- }
-
- This will keep all samples whose value (specified by the input) is >= 20 and remove all others.
- The allowed operations are :code:`SMALLER`, :code:`SMALLER_EQUAL`, :code:`BIGGER`, :code:`BIGGER_EQUAL`.
-
- .. tab:: BALANCE
-
- The objective of this strategy is to **select samples such that the distribution of classes in them is as close to a target distribution as possible**.
-
- E.g. the samples chosen should have 50% sunny and 50% rainy weather.
- Or, the objects of the samples chosen should be 40% ambulance and 60% buses.
-
- Can be used with **PREDICTIONS** and **categorical METADATA**.
-
- .. code-block:: python
-
- "strategy": {
- "type": "BALANCE",
- "target": {
- "Ambulance": 0.4, # `Ambulance` should be a valid class in your `schema.json`
- "Bus": 0.6
- }
- }
-
- If the values of the target do not sum up to 1, the remainder is assumed to be the target for the other classes.
- For example, if we would set the target to 20% ambulance and 40% bus, there is the implicit assumption, that the remaining 40% should come from any other class,
- e.g. from cars, bicycles or pedestrians.
-
- Note that not specified classes do not influence the selection process!
-
- .. tab:: SIMILARITY
-
- With this strategy you can use the embeddings from another dataset
- to **select similar images**. This can be useful if you are looking for more
- examples of certain edge cases.
-
- Can be used with **EMBEDDINGS**.
-
- .. code-block:: python
-
- "strategy": {
- "type": "SIMILARITY",
- }
-
-
-Configuration Examples
-----------------------
-
-Here are examples for the full configuration including the input for several objectives:
-
-.. dropdown:: Visual Diversity (CORESET)
-
- Choosing 100 samples that are visually diverse equals diversifying samples based on their embeddings:
-
- .. code-block:: python
-
- {
- "n_samples": 100, # set to the number of samples you want to select
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY"
- }
- }
- ]
- }
-
-
-.. dropdown:: Active Learning
-
- Active Learning equals weighting samples based on active learning scores:
-
- .. code-block:: python
-
- {
- "n_samples": 100, # set to the number of samples you want to select
- "strategies": [
- {
- "input": {
- "type": "SCORES",
- "task": "my_object_detection_task", # change to your task
- "score": "uncertainty_entropy" # change to your preferred score
- },
- "strategy": {
- "type": "WEIGHTS"
- }
- }
- ]
- }
-
- .. note:: This works as well for Image Classification or Segmentation!
-
-.. dropdown:: Visual Diversity and Active Learning (CORAL)
-
- For combining two strategies, just specify both of them:
-
- .. code-block:: python
-
- {
- "n_samples": 100, # set to the number of samples you want to select
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY"
- }
- },
- {
- "input": {
- "type": "SCORES",
- "task": "my_object_detection_task", # change to your task
- "score": "uncertainty_entropy" # change to your preferred score
- },
- "strategy": {
- "type": "WEIGHTS"
- }
- }
- ]
- }
-
-.. dropdown:: Metadata Thresholding
-
- This can be used to remove e.g. blurry images, which equals selecting
- samples whose sharpness is over a threshold:
-
- .. code-block:: python
-
- {
- "n_samples": 100, # set to the number of samples you want to select
- "strategies": [
- {
- "input": {
- "type": "METADATA",
- "key": "lightly.sharpness"
- },
- "strategy": {
- "type": "THRESHOLD",
- "threshold": 20,
- "operation": "BIGGER"
- }
- }
- ]
- }
-
-.. dropdown:: Object Balancing
-
- Use lightly pretagging to get the objects, then specify a target distribution of classes:
-
- .. code-block:: python
-
- {
- "n_samples": 100, # set to the number of samples you want to select
- "strategies": [
- {
- "input": {
- "type": "PREDICTIONS",
- "task": "lightly_pretagging", # (optional) change to your task
- "name": "CLASS_DISTRIBUTION"
- },
- "strategy": {
- "type": "BALANCE",
- "target": {
- "car": 0.1,
- "bicycle": 0.5,
- "bus": 0.1,
- "motorcycle": 0.1,
- "person": 0.1,
- "train": 0.05,
- "truck": 0.05
- }
- }
- }
- ]
- }
-
- .. note:: To use the `lightly pretagging` you need to enable it using :code:`'pretagging': True` in the
- worker config. See :ref:`docker-pretagging` for reference.
-
-.. dropdown:: Metadata Balancing
-
- Let’s assume you have specified metadata with the path `weather.description`
- and want your selected subset to have 20% sunny, 40% cloudy and the rest other images:
-
- .. code-block:: python
-
- {
- "n_samples": 100, # set to the number of samples you want to select
- "strategies": [
- {
- "input": {
- "type": "METADATA",
- "key": "weather.description"
- },
- "strategy": {
- "type": "BALANCE",
- "target": {
- "sunny": 0.2,
- "cloudy": 0.4
- }
- }
- }
- ]
- }
-
-.. dropdown:: Similarity Search
-
- To perform similarity search you need a dataset and a tag
- consisting of the query images.
-
- We can then use the following configuration to find similar images from the
- input dataset. This example will select 100 images from the input dataset that
- are the most similar to the images in the tag from the query dataset.
-
- .. code-block:: python
-
- {
- "n_samples": 100, # put your number here
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS",
- "dataset_id": "DATASET_ID_OF_THE_QUERY_IMAGES",
- "tag_name": "TAG_NAME_OF_THE_QUERY_IMAGES" # e.g. "initial-tag"
- },
- "strategy": {
- "type": "SIMILARITY",
- }
- }
- ]
- }
-
-Application of Strategies
--------------------------
-
-Generally, the order in which the different strategies were defined in the config does not matter.
-In a first step, all the thresholding strategies are applied.
-In the next step, all other strategies are applied in parallel.
-
-.. note:: Note that different tasks can also be combined. E.g. you can use predictions
- from "my_weather_classification_task" for one strategy combined with predictions from
- "my_object_detection_task" from another strategy.
-
-The Lightly Worker optimizer tries to fulfil all strategies as well as possible.
-**Potential reasons why your objectives were not satisfied:**
-
-- **Tradeoff between different objectives.**
- The optimizer always has to tradeoff between different objectives.
- E.g. it may happen that all samples with high WEIGHTS are close together. If you also specified the objective DIVERSITY, then only a few of these high-weight samples
- may be chosen. Instead, other samples that are more diverse but have lower weights are chosen as well.
-
-- **Restrictions in the input dataset.**
- This applies especially for BALANCE: E.g. if there are only 10 images of ambulances in the input dataset and a total
- of 1000 images are selected, the output can only have a maximum of 1% ambulances. Thus a BALANCE target of having 20% ambulances cannot be fulfilled.
-
-- **Too few samples to choose from.**
- If the selection algorithm can only choose a small number of samples, it may not be possible to fulfil the objectives.
- You can solve this by increasing :code:`n_samples` or :code:`proportion_samples`.
-
-Selection on object level
--------------------------
-
-The Lightly Worker supports doing selection on :ref:`docker-object-level`.
-
-While embeddings are fully available, there are some limitations regarding the usage of METADATA and predictions for SCORES and PREDICTIONS as input:
-
-- When using the object level workflow, the object detections used to create the object crops out of the images are available and can be used for both the SCORES and PREDICTIONS input. However, predictions from other tasks are NOT available at the moment.
-
-- The Lightly Worker generates metadata on the fly for the object crops and can thus be used for selection. However, other metadata is on image level and thus NOT available at the moment.
-
-If your use case would profit from using image-level data for object-level selection, please reach out to us.
diff --git a/docs/source/docker/getting_started/setup.rst b/docs/source/docker/getting_started/setup.rst
deleted file mode 100644
index aa2e8e262..000000000
--- a/docs/source/docker/getting_started/setup.rst
+++ /dev/null
@@ -1,221 +0,0 @@
-.. _docker-setup:
-
-Setup
-=====
-
-
-Analytics
-^^^^^^^^^
-
-The Lightly Worker currently reports usage metrics to our analytics software
-(we use mixpanel) which uses https encrypted GET and POST requests to https://api.mixpanel.com.
-The transmitted data includes information about crashes and the number of samples
-that have been filtered. However, **the data does not include input / output samples**,
-filenames, or any other information which can be sensitive to our customers.
-
-
-
-Licensing
-^^^^^^^^^
-
-The licensing and account management is done through the :ref:`ref-authentication-token`
-obtained from the Lightly Platform (https://app.lightly.ai).
-
-The token will be used to authenticate your account.
-The authentication happens at every run of the worker. Make sure the Lightly Worker
-has a working internet connection and has access to https://api.lightly.ai.
-
-
-
-Download the Python client
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-We recommend using the `lightly` Python package to interact with the Lightly API. It offers
-helper functions to create and delete datasets, schedule jobs, and access the results:
-
-.. code-block:: console
-
- pip install lightly
-
-See :ref:`rst-installing` for details.
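-
-To quickly verify that the package is installed, you can, for example, print its version:
-
-.. code-block:: console
-
-    python -c "import lightly; print(lightly.__version__)"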
-
-
-.. _docker-download-and-install:
-
-Docker Installation
-^^^^^^^^^^^^^^^^^^^^
-
-Lightly Worker requires docker to run. We highly recommend a docker installation
-that supports using GPUs for hardware acceleration using a Linux operating system.
-
-**Check if docker is installed:**
-
-.. code-block:: console
-
- sudo docker run --rm --gpus all nvidia/cuda:11.0.3-base-ubuntu20.04 nvidia-smi
-
-You might get an error message like this if docker is installed but lacks GPU support:
-
-.. code-block:: console
-
- docker could not select device driver with capabilities gpu
-
-
-If you don't have docker installed, or if it was installed without GPU support, we recommend following
-our guide about :ref:`rst-docker-known-issues-faq-install-docker`.
-
-.. note::
- If you use a cloud instance (e.g. on AWS, GCP or Azure) Docker is most likely
- already installed!
-
-Download the Lightly Worker
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-In short, installing the Docker image consists of the following steps:
-
-1. Make sure :code:`container-credentials.json` is on the machine where you want to run the Lightly Worker.
- We need to access the private container registry of Lightly. You received
- a :code:`container-credentials.json` file from your account manager.
-
-2. Authenticate your docker account
- To be able to download docker images of the Lightly Worker you need to log in with these credentials.
-
-   The following command authenticates you so that you can access the Lightly Worker docker images.
- We assume :code:`container-credentials.json` is in your current directory.
-
- .. code-block:: console
-
- cat container-credentials.json | docker login -u _json_key --password-stdin https://eu.gcr.io
-
- You should see a message stating `Login Succeeded`.
-
-3. Pull the Lightly Worker docker image
- Using the following command you pull the latest image from our European cloud server:
-
- .. code-block:: console
-
- docker pull eu.gcr.io/boris-250909/lightly/worker:latest
-
- In case you experience any issues pulling the docker image after successful
- authentication :ref:`check out our FAQ section`.
-
- .. warning::
-
- Until version 2.1.8 the latest image was named `eu.gcr.io/boris-250909/lightly/sampling:latest`
- from version 2.2 onwards the image is now called `eu.gcr.io/boris-250909/lightly/worker:latest`.
- Please make sure to update any old Docker run commands to use the new image name.
-
-4. Shorten the name of the docker image using :code:`docker tag`
-   The downloaded image has a long name. We can shorten it with *docker tag*.
-   The rest of this documentation uses the image name *lightly/worker:latest*.
-   Create a new Docker tag using the following command:
-
- .. code-block:: console
-
- docker tag eu.gcr.io/boris-250909/lightly/worker:latest lightly/worker:latest
-
-
-   .. note:: If you do not want to tag the image name, you can replace lightly/worker:latest
-      with eu.gcr.io/boris-250909/lightly/worker:latest in all commands in this documentation.
-
-
-Update the Lightly Worker
-^^^^^^^^^^^^^^^^^^^^^^^^^
-
-To update the Lightly Worker we simply need to pull the latest docker image.
-
-.. code-block:: console
-
- docker pull eu.gcr.io/boris-250909/lightly/worker:latest
-
-Don't forget to tag the image again after pulling it.
-
-.. code-block:: console
-
- docker tag eu.gcr.io/boris-250909/lightly/worker:latest lightly/worker:latest
-
-
-.. note:: You can download a specific version of the Docker image by indicating the version number
- instead of `latest`. We follow semantic versioning standards.
-
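-For example, to pin a specific version (the version number below is only a placeholder):
-
-.. code-block:: console
-
-    docker pull eu.gcr.io/boris-250909/lightly/worker:2.6.0
-    docker tag eu.gcr.io/boris-250909/lightly/worker:2.6.0 lightly/worker:latest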
-
-Furthermore, we always recommend using the latest version of the Lightly\ **SSL** python package
-alongside the latest version of the Lightly Worker. You can update the
-pip package using the following command.
-
-.. code-block:: console
-
- pip install lightly --upgrade
-
-.. _docker-setup-sanity-check:
-
-Sanity Check
-^^^^^^^^^^^^
-
-**Next**, verify that the Lightly Worker is installed correctly by running the following command:
-
-.. code-block:: console
-
- docker run --shm-size="1024m" --rm -it lightly/worker:latest sanity_check=True
-
-You should see an output similar to this one:
-
-.. code-block:: console
-
- [2022-05-02 20:37:27] Lightly Docker Solution v2.2.0
- [2022-05-02 20:37:27] Congratulations! It looks like the Lightly container is running!
-
-
-.. _worker-register:
-
-Register the Lightly Worker
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-**Finally**, start the Lightly Worker in waiting mode. In this mode, the worker will long-poll
-the Lightly API for new jobs to process. To do so, a worker first needs to be registered.
-
-
-.. note:: You only have to register each worker once. Registration is required because
-   several workers can be registered at the same time, working on different jobs in parallel.
-
-.. code-block:: python
-
- # execute the following code once to get a worker_id
- from lightly.api import ApiWorkflowClient
-
- client = ApiWorkflowClient(token='MY_AWESOME_TOKEN') # replace this with your token
- worker_id = client.register_compute_worker()
- print(worker_id)
-
-Store the `worker_id` in a secure location and then start the worker with
-
-
-.. code-block:: console
-
- docker run --shm-size="1024m" --gpus all --rm -it \
- lightly/worker:latest \
- token=MY_AWESOME_TOKEN \
- worker.worker_id=MY_WORKER_ID
-
-
-.. note:: All registered workers and their ids can be found under https://app.lightly.ai/compute/workers.
-
-All outputs generated by jobs will be uploaded to the Lightly API as artifacts. Artifacts are explained in more detail in :ref:`docker-first-steps`.
-
-Once the worker is running, you should see an output similar to this:
-
-.. code-block:: console
-
- [2022-06-03 07:57:34] Lightly Docker Solution v2.2.0
- [2022-06-03 07:57:34] You are using docker build: Wed Jun 1 09:51:10 UTC 2022.
- [2022-06-03 07:57:34] Starting worker with id 61f27c8bf2f5d06164071415
- [2022-06-03 07:57:34] Worker started. Waiting for jobs...
-
-.. note:: In case the command fails because docker does not detect your GPU,
-   make sure `nvidia-docker` is installed.
-   You can follow the guide
-   `here `_.
-
-
-Head on to :ref:`docker-first-steps` to see how to schedule a job!
diff --git a/docs/source/docker/images/lightly_docker_overview.png b/docs/source/docker/images/lightly_docker_overview.png
deleted file mode 100644
index 00f319afb..000000000
Binary files a/docs/source/docker/images/lightly_docker_overview.png and /dev/null differ
diff --git a/docs/source/docker/integration/dagster_aws.rst b/docs/source/docker/integration/dagster_aws.rst
deleted file mode 100644
index f794bfd3f..000000000
--- a/docs/source/docker/integration/dagster_aws.rst
+++ /dev/null
@@ -1,406 +0,0 @@
-
-.. _docker-integration-aws-dagster:
-
-Data Pre-processing Pipeline on AWS with Dagster
-================================================
-
-
-Introduction
---------------
-Data collection and pre-processing pipelines have become more and more automated in recent years. The Lightly Worker can take on a crucial role
-in such a pipeline as it can reliably filter out redundant images and corrupted images with high throughput.
-
-This guide shows how to write a simple automated data pre-processing pipeline which performs the following steps:
-
-1. Download a random video from `Pexels `_.
-2. Upload the video to an S3 bucket.
-3. Run the Lightly Worker on the video to extract a diverse set of frames for further processing.
-
-Here, the first two steps simulate a data collection process.
-
-.. note::
-
- The datapool option of the Lightly Worker allows it to remember frames/images it has seen
- in past executions of the pipeline and ignore images which are too similar to already known ones.
-
-
-Dagster
----------
-Dagster is an open-source data orchestrator for machine learning. It enables building, deploying, and
-debugging data processing pipelines. Click `here `__ to learn more.
-
-
-Setting up the S3 Bucket
---------------------------
-If you don't have an S3 bucket already, follow `these `_ instructions to create one.
-For the purpose of this tutorial, name the bucket `lightly-aws-bucket`. If you want to use a different S3 bucket, remember to replace all occurrences
-of `lightly-aws-bucket` in the rest of this guide.
-
-.. note::
- Make sure you have access to credentials to provide Lightly with `LIST` and `READ` access to the input bucket and
- with `LIST`, `READ`, and `WRITE` access to the output bucket. See :ref:`dataset-creation-gcloud-bucket`,
- :ref:`dataset-creation-aws-bucket`, and :ref:`dataset-creation-azure-storage` for help
- with configuring the different roles.
-
-Then, configure a dataset in the Lightly Platform which will represent the state of your datapool:
-
-.. code-block:: python
-
- from lightly.api import ApiWorkflowClient
- from lightly.openapi_generated.swagger_client.models.dataset_type import DatasetType
- from lightly.openapi_generated.swagger_client.models.datasource_purpose import DatasourcePurpose
-
- # Create the Lightly client to connect to the API.
- client = ApiWorkflowClient(token="YOUR_LIGHTLY_TOKEN")
-
- # Create a new dataset on the Lightly Platform.
- client.create_new_dataset_with_unique_name(
- 'my-datapool-name',
- DatasetType.IMAGES # can be DatasetType.VIDEOS when working with videos
- )
- print(f'Dataset id: {client.dataset_id}')
-
- ## AWS S3
- # Input bucket
-    client.set_s3_config(
-        resource_path="s3://lightly-aws-bucket/pexels",
-        region='eu-central-1',
-        access_key='S3-ACCESS-KEY',
-        secret_access_key='S3-SECRET-ACCESS-KEY',
-        purpose=DatasourcePurpose.INPUT
-    )
-    # Output bucket
-    client.set_s3_config(
-        resource_path="s3://lightly-aws-bucket/outputs/",
-        region='eu-central-1',
-        access_key='S3-ACCESS-KEY',
-        secret_access_key='S3-SECRET-ACCESS-KEY',
-        purpose=DatasourcePurpose.LIGHTLY
-    )
-
-Make sure to note the dataset id somewhere safe as you'll need it throughout this tutorial.
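-
-If you lose the dataset id, you can retrieve it later from the dataset name
-(a small sketch, assuming the same client as above):
-
-.. code-block:: python
-
-    client.set_dataset_id_by_name('my-datapool-name')
-    print(f'Dataset id: {client.dataset_id}')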
-
-
-
-Setting up the EC2 Instance
------------------------------
-The next step is to set up the EC2 instance. For the purposes of this tutorial,
-it's recommended to pick an instance with a GPU (like the g4dn.xlarge) and the "Deep Learning AMI (Ubuntu 18.04) Version 48.0" AMI.
-See `this guide `_ to get started. Connect to the instance.
-
-
-Next, the Lightly Worker should be installed on the instance. Please follow the instructions `here `__.
-Make sure you have the API token and the worker id from the setup steps. Start the worker in waiting mode with the following arguments:
-
-.. code-block:: shell
-
- # general
- IMAGE=lightly/worker:latest
-
- OUTPUT_DIR=/home/ubuntu/output_dir/
-
- # api
- LIGHTLY_TOKEN=YOUR_LIGHTLY_TOKEN
- WORKER_ID=MY_WORKER_ID
-
- # run command
- # this makes the Lightly Worker start up and wait for jobs
-    docker run --shm-size="1024m" --gpus all --rm -it \
-        ${IMAGE} \
-        token=${LIGHTLY_TOKEN} \
-        worker.worker_id=${WORKER_ID}
-
-
-Integration
--------------
-
-Before you start, install the following dependencies:
-
-
-.. code:: console
-
- pip install pypexels
- pip install boto3
- pip install dagster
-
-
-Now that everything is set up, begin building the data processing pipeline. Dagster pipelines consist of several `solids` which can
-be chained one after another. Put each solid in a separate file and aim for the following directory structure:
-
-.. code:: console
-
- ./source
- ├── aws_example_pipeline.py
- └── solids
- ├── aws
- │ ├── lightly.py
- │ └── s3.py
- └── pexels.py
-
-
-The following code is the content of `pexels.py` and represents the first solid in the pipeline.
-It downloads a random video from `Pexels `_ and saves it in the current
-working directory. Don't forget to set the `PEXELS_API_KEY`.
-
-
-.. code-block:: python
-
- import os
- import string
- import random
- import requests
-
- from typing import List
-
- from pypexels import PyPexels
-
- from dagster import solid
-
-
- PEXELS_API_KEY = 'YOUR_PEXELS_API_KEY'
-
-
- class PexelsClient:
- """Pexels client to download a random popular video.
-
- """
-
- def __init__(self):
- self.api = PyPexels(api_key=PEXELS_API_KEY)
-
-
- def random_filename(self, size_: int = 8):
- """Generates a random filename of uppercase letters and digits.
-
- """
- chars = string.ascii_uppercase + string.digits
- return ''.join(random.choice(chars) for _ in range(size_)) + '.mp4'
-
-
- def download_video(self, root: str):
- """Downloads a random popular video from pexels and saves it.
-
- """
- popular_videos = self.api.videos_popular(per_page=40)._body['videos']
- video = random.choice(popular_videos)
- video_file = video['video_files'][0]
- video_link = video_file['link']
-
-            response = requests.get(video_link)
-
-            path = os.path.join(root, self.random_filename())
-            with open(path, 'wb') as outfile:
-                outfile.write(response.content)
-
- return path
-
-
- @solid
- def download_random_video_from_pexels() -> str:
- """Dagster solid to download a random pexels video to the current directory.
-
- Returns:
- The path to the downloaded video.
-
- """
-
- client = PexelsClient()
- path = client.download_video('./')
-
- return path
-
-
-The next solid in the pipeline (`s3.py`) uploads the video to the S3 bucket. It saves the video
-in a randomly created subfolder in the S3 bucket.
-Set the `BUCKET_NAME` and `REGION_NAME` to your bucket name and region of the EC2 instance.
-
-
-.. code-block:: python
-
- import os
- import string
- import random
-
- import boto3
- from botocore.exceptions import ClientError
-
- from dagster import solid
-
-
- BUCKET_NAME: str = 'lightly-aws-bucket'
- REGION_NAME: str = 'YOUR_REGION_NAME' # e.g. eu-central-1
-
-
- class S3Client:
- """S3 client to upload files to a bucket.
-
- """
-
- def __init__(self):
- self.s3 = boto3.client('s3', region_name=REGION_NAME)
-
-
- def random_subfolder(self, size_: int = 8):
- """Generates a random subfolder name of uppercase letters and digits.
-
- """
- chars = string.ascii_uppercase + string.digits
- return ''.join(random.choice(chars) for _ in range(size_))
-
-
- def upload_file(self, filename: str):
- """Uploads the file at filename to the s3 bucket.
-
- Generates a random subfolder so the file will be stored at:
- >>> BUCKET_NAME/RANDOM_SUBFOLDER/basefilename.mp4
-
- """
-
- # upload file to lightly-aws-bucket/pexels/RANDOM_STRING/basename.mp4
- object_name = os.path.join(
- 'pexels',
- self.random_subfolder(),
- os.path.basename(filename)
- )
-
- # Upload the file
- try:
- self.s3.upload_file(filename, BUCKET_NAME, object_name)
- except ClientError as e:
- print(e)
- return None
-
- return object_name
-
-
- @solid
- def upload_video_to_s3(filename: str) -> str:
- """Dagster solid to upload a video to an s3 bucket.
-
- Args:
- filename:
- Path to the video which should be uploaded.
-
- Returns:
- The name of the object in the s3 bucket.
-
- """
-
- s3_client = S3Client()
- object_name = s3_client.upload_file(filename)
-
- return object_name
-
-
-Finally, the last solid in the pipeline (`lightly.py`) runs the Lightly Worker on the newly collected videos.
-Set `LIGHTLY_TOKEN` and `DATASET_ID` accordingly.
-
-.. code-block:: python
-
-    import os
-    import time
-
-    from dagster import solid
-
-    from lightly.api import ApiWorkflowClient
-
- LIGHTLY_TOKEN: str = 'YOUR_LIGHTLY_TOKEN'
- DATASET_ID: str = 'YOUR_DATASET_ID'
-
-
-
- class LightlyClient:
- """Lightly client to run the Lightly Worker.
-
- """
-
- def __init__(self, token: str, dataset_id: str):
- self.token = token
- self.dataset_id = dataset_id
-
-        def run_lightly_worker(self):
- """Runs the Lightly Worker on the EC2 instance.
-
- """
-
- client = ApiWorkflowClient(
- token=self.token,
- dataset_id=self.dataset_id
- )
- client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": False,
- "pretagging_debug": False,
- },
- selection_config={
- "n_samples": 50,
- "strategies": [
- {
- "input": {
- "type": "EMBEDDINGS"
- },
- "strategy": {
- "type": "DIVERSITY"
- }
- }
- ]
- }
- )
-
-
- @solid
- def run_lightly_worker() -> None:
- """Dagster solid to run Lightly Worker on a remote EC2 instance.
-
- """
-
- lightly_client = LightlyClient(LIGHTLY_TOKEN, DATASET_ID)
- lightly_client.run_lightly_worker()
-
-
-To put the solids together in a single pipeline, save the following code in `aws_example_pipeline.py`:
-
-
-.. code-block:: python
-
- from dagster import pipeline
-
- from solids.pexels import download_random_video_from_pexels
- from solids.aws.s3 import upload_video_to_s3
-    from solids.aws.lightly import run_lightly_worker
-
-
- @pipeline
- def aws_example_pipeline():
- """Example data processing pipeline with Lightly on AWS.
-
- The pipeline performs the following three steps:
- - Download a random video from pexels
- - Upload the video to an s3 bucket
- - Run the Lightly pre-selection solution on the video and store the
- extracted frames in the s3 bucket
-
- """
- file_name = download_random_video_from_pexels()
- upload_video_to_s3(file_name)
-        run_lightly_worker()
-
-
-Dagster allows you to visualize pipelines in a web interface. The following command
-shows the above pipeline on `127.0.0.1:3000`:
-
-.. code-block:: console
-
- dagit -f aws_example_pipeline.py
-
-
-Finally, you can execute the pipeline with the following command:
-
-
-.. code-block:: console
-
- dagster pipeline execute -f aws_example_pipeline.py
-
-For automatic execution of the pipeline you can set up a cronjob (see the sketch below), trigger the pipeline
-upon certain events, or deploy it to an `AWS EC2 or GCP GCE `_.
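-
-A minimal cron entry could look like this (a sketch; the path to the pipeline file and
-the schedule are placeholders):
-
-.. code-block:: console
-
-    # run the pipeline every day at 3am
-    0 3 * * * dagster pipeline execute -f /path/to/aws_example_pipeline.py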
\ No newline at end of file
diff --git a/docs/source/docker/integration/examples/create_dataset.py b/docs/source/docker/integration/examples/create_dataset.py
deleted file mode 100644
index ce8e9aee8..000000000
--- a/docs/source/docker/integration/examples/create_dataset.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import lightly
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="LIGHTLY_TOKEN")
-
-# Create a new dataset on the Lightly Platform.
-client.create_dataset("dataset-name")
-
-# Connect the dataset to your cloud bucket.
-
-# AWS S3
-client.set_s3_config(
- resource_path="s3://bucket/dataset/",
- region="eu-central-1",
- access_key="ACCESS-KEY",
- secret_access_key="SECRET",
- thumbnail_suffix=None,
-)
-
-# Google Cloud Storage
-import json
-
-client.set_gcs_config(
- resource_path="gs://bucket/dataset/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials.json"))),
- thumbnail_suffix=None,
-)
-
-# Azure Blob Storage
-client.set_azure_config(
- container_name="container/dataset/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- thumbnail_suffix=None,
-)
diff --git a/docs/source/docker/integration/examples/trigger_job.py b/docs/source/docker/integration/examples/trigger_job.py
deleted file mode 100644
index e60831c24..000000000
--- a/docs/source/docker/integration/examples/trigger_job.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import time
-
-from lightly.openapi_generated.swagger_client import (
- DockerRunScheduledState,
- DockerRunState,
-)
-
-# You can reuse the client from previous scripts. If you want to create a new
-# one you can uncomment the following line:
-# import lightly
-# client = lightly.api.ApiWorkflowClient(token="LIGHTLY_TOKEN", dataset_id="DATASET_ID")
-
-# Schedule the compute run using a custom config.
-# You can easily edit the values according to your needs.
-
-
-scheduled_run_id = client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False,
- },
- selection_config={
- "n_samples": 50,
- "strategies": [
- {"input": {"type": "EMBEDDINGS"}, "strategy": {"type": "DIVERSITY"}}
- ],
- },
- lightly_config={
- "loader": {
- "batch_size": 16,
- "shuffle": True,
- "num_workers": -1,
- "drop_last": True,
- },
- "model": {"name": "resnet-18", "out_dim": 128, "num_ftrs": 32, "width": 1},
- "trainer": {"gpus": 1, "max_epochs": 100, "precision": 32},
- "criterion": {"temperature": 0.5},
- "optimizer": {"lr": 1, "weight_decay": 0.00001},
- "collate": {
- "input_size": 64,
- "cj_prob": 0.8,
- "cj_bright": 0.7,
- "cj_contrast": 0.7,
- "cj_sat": 0.7,
- "cj_hue": 0.2,
- "min_scale": 0.15,
- "random_gray_scale": 0.2,
- "gaussian_blur": 0.5,
- "kernel_size": 0.1,
- "vf_prob": 0,
- "hf_prob": 0.5,
- "rr_prob": 0,
- },
- },
-)
-
-"""
-Optionally, You can use this code to track and print the state of the compute worker.
-The loop will end once the compute worker run has finished, was canceled or aborted/failed.
-"""
-for run_info in client.compute_worker_run_info_generator(
- scheduled_run_id=scheduled_run_id
-):
- print(
- f"Compute worker run is now in state='{run_info.state}' with message='{run_info.message}'"
- )
-
-if run_info.ended_successfully():
- print("SUCCESS")
-else:
- print("FAILURE")
diff --git a/docs/source/docker/integration/images/aws-s3-bucket-list.png b/docs/source/docker/integration/images/aws-s3-bucket-list.png
deleted file mode 100644
index ab2012424..000000000
Binary files a/docs/source/docker/integration/images/aws-s3-bucket-list.png and /dev/null differ
diff --git a/docs/source/docker/integration/images/schedule-compute-run-config.png b/docs/source/docker/integration/images/schedule-compute-run-config.png
deleted file mode 100644
index 4f8d83521..000000000
Binary files a/docs/source/docker/integration/images/schedule-compute-run-config.png and /dev/null differ
diff --git a/docs/source/docker/integration/images/schedule-compute-run.png b/docs/source/docker/integration/images/schedule-compute-run.png
deleted file mode 100644
index e7a3cc75e..000000000
Binary files a/docs/source/docker/integration/images/schedule-compute-run.png and /dev/null differ
diff --git a/docs/source/docker/integration/images/webapp-explore-after-docker.jpg b/docs/source/docker/integration/images/webapp-explore-after-docker.jpg
deleted file mode 100644
index 19ad8848f..000000000
Binary files a/docs/source/docker/integration/images/webapp-explore-after-docker.jpg and /dev/null differ
diff --git a/docs/source/docker/integration/overview.rst b/docs/source/docker/integration/overview.rst
deleted file mode 100644
index 81e509a84..000000000
--- a/docs/source/docker/integration/overview.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-Integration
-===================================
-Here you learn how to integrate the Lightly Worker into data pre-processing pipelines.
-
-
-.. toctree::
- :maxdepth: 1
-
- docker_with_datasource.rst
- docker_trigger_from_api.rst
- dagster_aws.rst
diff --git a/docs/source/docker/known_issues_faq.rst b/docs/source/docker/known_issues_faq.rst
deleted file mode 100644
index 175f7b311..000000000
--- a/docs/source/docker/known_issues_faq.rst
+++ /dev/null
@@ -1,328 +0,0 @@
-.. _rst-docker-known-issues-faq:
-
-Known Issues and FAQ
-===================================
-
-
-.. _rst-docker-known-issues-faq-install-docker:
-
-Installing Docker with GPU support
--------------------------------------
-If you install docker using :code:`apt-get install docker` or by following the
-`official docker installation docs `_
-you might not install the version that also supports GPU drivers.
-
-Instead, you should follow the
-`docker installation docs from Nvidia `_.
-
-Here is a quick summary for the shell commands you need:
-
-1. Setup package repository (to find the nvidia docker package)
- .. code-block:: console
-
- distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
- && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
- && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
- sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
- sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
-2. Update the repository
- .. code-block:: console
-
- sudo apt-get update
-3. Install nvidia-docker
- .. code-block:: console
-
- sudo apt-get install -y nvidia-docker2
-4. Restart the docker service
- .. code-block:: console
-
- sudo systemctl restart docker
-5. Test installation by running `nvidia-smi` within a docker container
- .. code-block:: console
-
- sudo docker run --rm --gpus all nvidia/cuda:11.0.3-base-ubuntu20.04 nvidia-smi
-
- You should see something like this now:
-
- .. code-block:: console
-
- +-----------------------------------------------------------------------------+
- | NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 |
- |-------------------------------+----------------------+----------------------+
- | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
- | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
- | | | MIG M. |
- |===============================+======================+======================|
- | 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 |
- | N/A 34C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |
- | | | N/A |
- +-------------------------------+----------------------+----------------------+
-
- +-----------------------------------------------------------------------------+
- | Processes: |
- | GPU GI CI PID Type Process name GPU Memory |
- | ID ID Usage |
- |=============================================================================|
- | No running processes found |
- +-----------------------------------------------------------------------------+
-6. Make sure we can run docker as a non-root user (recommended for security).
-   We can follow the instructions from the official docker docs https://docs.docker.com/engine/install/linux-postinstall/
-
- .. code-block:: console
-
- sudo groupadd docker
-
- .. code-block:: console
-
- sudo usermod -aG docker $USER
-
- .. code-block:: console
-
- newgrp docker
-
-7. Test whether we can run docker with GPU support and as a non-root user
-
- .. code-block:: console
-
- docker run --rm --gpus all nvidia/cuda:11.0.3-base-ubuntu20.04 nvidia-smi
-
-
-Lightly Worker is slow when working with long videos
------------------------------------------------------
-
-We are working on this issue internally. For now, we suggest splitting the large
-videos into chunks. You can do this with ffmpeg and without losing quality.
-The following command just splits the video so that no re-encoding is needed.
-
-.. code-block:: console
-
- ffmpeg -i input.mp4 -c copy -map 0 -segment_time 01:00:00 -f segment -reset_timestamps 1 output%03d.mp4
-
-What exactly happens here?
-
-- `input.mp4`, this is your input video
-- `-c copy -map 0`, this makes sure we just copy and don't re-encode the video
-- `-segment_time 01:00:00 -f segment`, defines that we want chunks of 1h each
-- `-reset_timestamps 1`, makes sure we reset the timestamps (each video starts from 0)
-- `output%03d.mp4`, name of the output videos (output001.mp4, output002.mp4, ...)
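-
-If you have many long videos, a small shell loop applies the same split to each of them
-(a sketch, assuming all videos are `.mp4` files in the current directory):
-
-.. code-block:: console
-
-    for f in *.mp4; do
-        ffmpeg -i "$f" -c copy -map 0 -segment_time 01:00:00 -f segment -reset_timestamps 1 "${f%.mp4}_%03d.mp4"
-    done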
-
-Lightly Worker Crashes when running with GPUs
----------------------------------------------
-
-You run the docker with `--gpus all` and encounter the following error?
-
-.. code-block:: console
-
- Error response from daemon: could not select device driver "" with capabilities: [[gpu]].
-
-This error might occur because your docker installation does not support GPUs.
-
-Try to install `nvidia-docker` following the guide
-`here `_.
-
-
-Shared Memory Error when running Lightly Worker
------------------------------------------------
-
-The following error message appears when the docker runtime does not have enough
-shared memory. By default, Docker uses 64 MB. However, when using multiple
-workers for data fetching (:code:`lightly.loader.num_workers`), this might not be enough.
-
-.. code-block:: console
-
- ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
- Traceback (most recent call last):
- File "/opt/conda/envs/env/lib/python3.7/multiprocessing/queues.py", line 236, in _feed
- obj = _ForkingPickler.dumps(obj)
- File "/opt/conda/envs/env/lib/python3.7/multiprocessing/reduction.py", line 51, in dumps
- cls(buf, protocol).dump(obj)
- File "/opt/conda/envs/env/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 321, in reduce_storage
- fd, size = storage._share_fd_()
- RuntimeError: unable to write to file
-
-To solve this problem we need to reduce the number of workers or
-increase the shared memory for the docker runtime.
-
-The Lightly Worker determines the number of available CPU cores and sets the number
-of workers to the same number. If you have a machine with many cores but not so much
-memory (e.g. less than 2 GB of memory per core), you may run out of memory,
-and in that case you should rather reduce
-the number of workers instead of increasing the shared memory, as shown below.
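-
-As a sketch, the number of workers can be reduced by overriding
-:code:`lightly.loader.num_workers` in the run command (token and worker id are placeholders):
-
-.. code-block:: console
-
-    docker run --shm-size="1024m" --gpus all --rm -it \
-        lightly/worker:latest \
-        token=MY_AWESOME_TOKEN \
-        worker.worker_id=MY_WORKER_ID \
-        lightly.loader.num_workers=2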
-
-You can change the shared memory from 64 MBytes to 512 MBytes by
-adding `--shm-size="512m"` to the docker run command:
-
-.. code-block:: console
-
- # example of docker run with setting shared memory to 512 MBytes
- docker run --shm-size="512m" --gpus all
-
- # you can also increase it to 2 Gigabytes using
- docker run --shm-size="2G" --gpus all
-
-
-
-CUDA error: all CUDA-capable devices are busy or unavailable
-------------------------------------------------------------
-
-It might happen that you bump into this error when running the Lightly Worker
-to process a job.
-
-.. code-block:: console
-
- CUDA error: all CUDA-capable devices are busy or unavailable CUDA kernel
- errors might be asynchronously reported at some other API call,so the
- stacktrace below might be incorrect. For debugging consider
- passing CUDA_LAUNCH_BLOCKING=1.
-
-The reason this error occurs is most likely that some process on your machine
-reserved resources on the GPU without properly releasing them. It can be
-that a particular piece of software is running, or that a combination
-of CUDA versions and other software caused this.
-
-Very often a simple reboot will resolve the problem, as
-during the reboot all GPU resources are freshly allocated. However, if a
-reboot does not help, we suggest using another CUDA version on your system.
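-
-As a sketch of the typical debugging steps:
-
-.. code-block:: console
-
-    # check whether another process is still holding GPU resources
-    nvidia-smi
-
-    # if nothing obvious shows up, a reboot usually frees the resources
-    sudo reboot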
-
-
-Lightly Worker crashes because of too many open files
------------------------------------------------------
-
-The following error message appears when the docker runtime does not have enough
-file handlers. By default, Docker uses 1024. However, when using multiple
-workers for data fetching (`lightly.loader.num_workers`) this might not be
-enough. As file handlers are used in many different parts of the code,
-the actual error message may differ. Connections to
-the Lightly API also use file handlers.
-
-A typical example (the exact message may differ) is:
-
-.. code-block:: console
-
-    OSError: [Errno 24] Too many open files
-
-To solve this problem we need to increase the number of file handlers for the
-docker runtime.
-
-You can change the number of file handlers to 90000 by adding
-`--ulimit nofile=90000:90000` to the docker run command:
-
-.. code-block:: console
-
- # example of docker run with 90000 file handlers
- docker run --ulimit nofile=90000:90000 --gpus all
-
-More documentation on docker file handlers is provided `here `_.
-
-
-Permission denied for input created with sudo
------------------------------------------------
-
-Problems can occur if the input directory was created with root/sudo and
-the container tries to access it. This can be solved by making the files readable:
-
-.. code-block:: console
-
- # make subdirectories browsable
- find MY_INPUT_DIR -type d -exec chmod 755 {} +
-
- # make the files themselves readable
- find MY_INPUT_DIR -type f -exec chmod 644 {} +
-
-
-Error when using S3 fuse and mounting to docker
-------------------------------------------------
-
-If you use docker in combination with S3 fuse you might stumble across an issue
-that the docker container can't create the mount path for the input directory.
-
-.. code-block:: console
-
- docker: Error response from daemon: error while creating mount source path \
- '/home/ubuntu/mydataset/': mkdir /home/ubuntu/mydataset: file exists.
-
-You can resolve this problem by following the guide here:
-https://stackoverflow.com/a/61686833
-
-1. uncomment **user_allow_other** option in the **/etc/fuse.conf** file
-2. when you mount the bucket using s3fs use the **-o allow_other** option.
-
- .. code-block:: console
-
- s3fs my-s3-bucket /s3-mount -o allow_other -o use_cache=/tmp
-
-
-Token printed to shared stdout or logs
---------------------------------------
-
-The token (along with other Hydra configuration) will be printed to stdout, and so could appear in logs in an automated setup.
-
-.. code-block:: console
-
- docker run --rm -it \
- -v {OUTPUT_DIR}:/home/shared_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- ...
-
-This can be avoided by setting your `token` via the `LIGHTLY_TOKEN` environment variable:
-
-.. code-block:: console
-
- docker run --rm -it \
-        -e LIGHTLY_TOKEN=MYAWESOMETOKEN \
- -v {OUTPUT_DIR}:/home/shared_dir \
- lightly/worker:latest \
- ...
-
-
-.. _rst-docker-known-issues-faq-pulling-docker:
-
-No permission to pull the docker image
---------------------------------------
-
-Please make sure the authentication succeeded as described in the
-:ref:`docker-download-and-install` guide.
-
-If you still can't pull the docker image it might be that the docker config
-is causing the problem.
-
-You can check the config using the following command:
-
-.. code-block:: console
-
- cat ~/.docker/config.json
-
-You should see a section with the key used for authentication. If you also see
-a section about `credHelpers`, it might overrule the authentication.
-
-The `credHelpers` can overrule the key for certain URLs. This can lead to
-permission errors when pulling the docker image.
-
-The Lightly Worker docker images are hosted in the European location. Therefore,
-it's important that pulling from the `eu.gcr.io` domain uses
-the provided credentials.
-
-
-There are two ways to solve the problem:
-
-- You can delete the config and run the authentication again.
-
- .. code-block:: console
-
- rm ~/.docker/config.json
-
- cat container-credentials.json | docker login -u _json_key --password-stdin https://eu.gcr.io
-
-- You can work with two configs. We recommend creating a dedicated folder
- for the Lightly Worker docker config.
-
- .. code-block:: console
-
- mkdir -p ~/.docker_lightly/
-
- cat container-credentials.json | docker --config ~/.docker_lightly/ login -u _json_key --password-stdin https://eu.gcr.io
-
- docker --config ~/.docker_lightly/ pull eu.gcr.io/boris-250909/lightly/worker:latest
-
-Whenever you pull a new image (e.g. when updating the Lightly Worker), you need to
-pass the corresponding config using the `--config` parameter.
\ No newline at end of file
diff --git a/docs/source/docker/overview.rst b/docs/source/docker/overview.rst
deleted file mode 100644
index 13d2d525b..000000000
--- a/docs/source/docker/overview.rst
+++ /dev/null
@@ -1,76 +0,0 @@
-Lightly Worker
-==============
-
-When working with ML data, we often deal with really BIG datasets. The cloud solution is great for exploration and prototyping
-and offers an easy way to work with Lightly. But there is more!
-
-.. figure:: images/lightly_docker_overview.png
- :align: center
-   :alt: Lightly Worker overview
- :figclass: align-center
-
-With the introduction of our on-premise solution, you can **process larger datasets completely on your end without data leaving your infrastructure**.
-We worked hard to make this happen and are very proud to present you with the following specs:
-
-* :ref:`docker-active-learning` using Lightly Worker
-
-* See your docker runs live in the Lightly Platform (see :ref:`ref-docker-runs`)
-
-* Lightly Worker has built-in pretagging models (see :ref:`docker-pretagging`)
-
- * Use this feature to pre-label your dataset or to only select images which contain certain objects
-
- * Supported object categories are: bicycle, bus, car, motorcycle, person, train, truck
-
-* Select from more than 10 Million samples within a few hours!
-
-* Runs directly with videos without prior extraction of the frames!
-
-* Wrapped in a docker container (no setup required if your system supports docker)
-
-* Configurable
-
- * Use stopping conditions for the selection strategy such as minimum distance between
- two samples
-
- * Use various selection strategies
-
- * Check for corrupt files and report them
-
- * Check for exact duplicates and report them
-
- * We expose the full Lightly\ **SSL** OSS framework config
-
-* Automated reporting of the datasets for each run
-
- * PDF report with histograms, plots, statistics, and much more ...
-
-* Hand-optimized code (to instruction-level)
-
- * Multithreaded
-
- * SIMD instructions
-
-* Minimal hardware requirements:
-
- * 1 CPU core
-
- * 4 GB free RAM
-
-* Recommended hardware:
-
- * see :ref:`hardware-recommendations`
-
-.. toctree::
- :maxdepth: 1
-
- getting_started/setup.rst
- getting_started/first_steps.rst
- getting_started/selection.rst
- advanced/overview.rst
- integration/overview.rst
- configuration/configuration.rst
- examples/overview.rst
- security/security.rst
- known_issues_faq.rst
- getting_started/hardware_recommendations.rst
diff --git a/docs/source/docker/security/images/lightly-cloud-architecture.png b/docs/source/docker/security/images/lightly-cloud-architecture.png
deleted file mode 100644
index 7a6caacc0..000000000
Binary files a/docs/source/docker/security/images/lightly-cloud-architecture.png and /dev/null differ
diff --git a/docs/source/docker/security/security.rst b/docs/source/docker/security/security.rst
deleted file mode 100644
index db841eadd..000000000
--- a/docs/source/docker/security/security.rst
+++ /dev/null
@@ -1,88 +0,0 @@
-Security
-==========
-
-Security and data privacy are very important to us. In this section
-you find all security-related information.
-Legal documents such as the Privacy Notice, T&C, and DPA are available under
-`https://lightly.ai/legal `_.
-
-.. note:: These documents apply to all personal, team and custom account plans.
-
-Architecture Overview
-----------------------
-
-Here you find an overview of the cloud architecture.
-
-**A few important things to note:**
-
-- Data storage and processing occur within the client's own cloud infrastructure
-- Lightly only needs permission to list files within your cloud storage
- (S3, GCS, Azure) and to create signed URLs
-- Lightly assets such as images, videos, sequences, frames, objects or thumbnails are
-  always stored within the client's storage. Any additional data such as metadata, predictions or
- any other non-sensitive data used to manage the datasets is stored in secured
- databases within Lightly's own infrastructure
-- Authentication is provided through our partner Auth0. Additional services such
- as 2FA/MFA, SAML can be added upon request.
-
-.. figure:: images/lightly-cloud-architecture.png
- :align: center
- :alt: Image of Lightly Architecture
- :figclass: align-center
-
-How does your data flow around?
--------------------------------
-
-We differentiate between usage data and the actual samples stored in
-your cloud storage. Samples can be images
-or videos and their subtypes such as sequences, frames or object crops.
-Samples typically contain sensitive information (PII), and we set up
-the whole Lightly architecture in a way that lets you fully restrict sensitive data from
-leaving your cloud environment (see :ref:`dataset-creation-aws-bucket-minimum-policy`).
-
-**Whenever you process a new dataset using Lightly the following steps happen:**
-
-1. You create a new job using the Lightly Python API. The job contains
- information about the location of the data (bucket path to S3, GCP or Azure)
- as well as the parameters of how the data should be processed.
-2. After the job has been created, it can be processed by the Lightly Worker. The
- Lightly Worker typically runs on a GPU instance within your cloud environment.
- It uses the job information to load the data directly from the cloud bucket using
- signed URLs created by the Lightly API.
-3. At the end of the job, the Lightly Worker pushes part of the results from the selection
- to the Lightly cloud. The Lightly Cloud gets data such as which filenames have been
- selected, the embeddings, the metadata and predictions.
- The other parts of the results such as thumbnails and frames that could contain
- sensitive information are stored in your bucket within your cloud infrastructure.
-
-**This setup has several advantages:**
-
-- The large amount of input data that could contain thousands of videos or millions of frames
- is only moved around within your cloud infrastructure. If the data is in the same region,
- there is no egress traffic cost and the latency is very low.
-- Lightly never stores sensitive data such as samples, and therefore you don't have to worry
-  about this.
-- This setup allows for additional hardening of the access rules as the Lightly Cloud does not
- need to read the actual data in your bucket (see :ref:`dataset-creation-aws-bucket-minimum-policy`)
-
-What data is stored where?
---------------------------
-
-**Data stored on Lightly Cloud:**
-
-- Embeddings - contain the filenames of the sample and a vector representation
- describing it
-- Metadata - any metadata provided to the data selection workflow is cached
- for faster access and visualization in the user interface
-- Predictions - similar to metadata, predictions are cached as well
-
-**Data stored on your Cloud:**
-
-- Samples - the actual images and videos
-- extracted crops, frames or thumbnails
-
-
-.. note:: Lightly caches predictions and metadata for faster retrieval.
- However, the input images as well as thumbnails or frames from videos are
- always fetched from your connected cloud bucket directly. They are never
- cached on Lightly infrastructure!
\ No newline at end of file
diff --git a/docs/source/docker_archive/advanced/active_learning.rst b/docs/source/docker_archive/advanced/active_learning.rst
deleted file mode 100644
index 64db65ee2..000000000
--- a/docs/source/docker_archive/advanced/active_learning.rst
+++ /dev/null
@@ -1,339 +0,0 @@
-.. _ref-docker-active-learning:
-
-Active Learning
-===============
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Lightly makes use of active learning scores to select the samples which will yield
-the biggest improvements for your machine learning model. The scores are calculated
-on-the-fly based on model predictions and provide the selection algorithm with feedback
-about the uncertainty of the model for the given sample.
-
-.. note:: Note that the active learning features require Lightly Worker
-   version 2.2 or newer. You can check your installed version of the
-   Lightly Worker by running the :ref:`ref-docker-setup-sanity-check`.
-
-Prerequisites
---------------
-In order to do active learning with Lightly, you will need the following things:
-
-- The installed Lightly docker (see :ref:`ref-docker-setup`)
-- A dataset with a configured datasource (see :ref:`ref-docker-with-datasource-datapool`)
-- Your predictions uploaded to the datasource (see :ref:`ref-docker-datasource-predictions`)
-
-.. note::
-
- The dataset does not need to be new! For example, an initial selection without
- active learning can be used to train a model. The predictions from this model
- can then be used to improve your dataset by adding new images to it through active learning.
-
-
-Selection
--------------------------
-Once you have everything set up as described above, you can do an active learning
-iteration by specifying the following three things in your Lightly docker config:
-
-- `method`
-- `active_learning.task_name`
-- `active_learning.score_name`
-
-Here's an example of how to configure an active learning run:
-
-
-.. tabs::
-
- .. tab:: Web App
-
- **Trigger the Job**
-
- To trigger a new job you can click on the schedule run button on the dataset
- overview as shown in the screenshot below:
-
- .. figure:: ../integration/images/schedule-compute-run.png
-
- After clicking on the button you will see a wizard to configure the parameters
- for the job.
-
- .. figure:: ../integration/images/schedule-compute-run-config.png
-
- In this example we have to set the `active_learning.task_name` parameter
- in the docker config. Additionally, we set the `method` to `coral` which
- simultaneously considers the diversity and the active learning scores of
- the samples. All other settings are default values. The
- resulting docker config should look like this:
-
- .. literalinclude:: code_examples/active_learning_worker_config.txt
- :caption: Docker Config
- :language: javascript
-
- The Lightly config remains unchanged.
-
- .. tab:: Python Code
-
- .. literalinclude:: code_examples/python_run_active_learning.py
-
-
-After the worker has finished its job you can see the selected images with their
-active learning score in the web-app.
-
-
-Active Learning with Custom Scores (not recommended as of March 2022)
-----------------------------------------------------------------------
-
-.. note::
- This is not recommended anymore as of March 2022 and will be deprecated in the future!
-
-
-For running an active learning step with the Lightly docker, we need to perform
-3 steps:
-
-1. Create an `embeddings.csv` file. You can use your own models or the Lightly docker for this.
-2. Add your active learning scores as an additional column to the embeddings file.
-3. Use the Lightly docker to perform an active learning iteration on the scores.
-
-
-Create Embeddings
-^^^^^^^^^^^^^^^^^
-
-You can create embeddings using your own model. Just make sure the resulting
-`embeddings.csv` file matches the required format:
-:ref:`ref-cli-embeddings-lightly`.
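-
-A minimal sketch of writing such a file with the standard library (the filenames,
-embeddings, and labels below are placeholders for the outputs of your own model):
-
-.. code-block:: python
-
-    import csv
-
-    filenames = ['cats/0001.jpg', 'dogs/0005.jpg']   # paths relative to the input directory
-    embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]  # one embedding vector per file
-    labels = [0, 1]                                  # dummy labels are fine
-
-    with open('embeddings.csv', 'w', newline='') as f:
-        writer = csv.writer(f)
-        header = ['filenames'] + [f'embedding_{i}' for i in range(len(embeddings[0]))] + ['labels']
-        writer.writerow(header)
-        for filename, embedding, label in zip(filenames, embeddings, labels):
-            writer.writerow([filename] + list(embedding) + [label])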
-
-Alternatively, you can run the docker as usual and as described in the
-:ref:`rst-docker-first-steps` section.
-The only difference is that you set the number of samples to be selected to 1.0,
-as this simply creates an embedding of the full dataset.
-
-E.g. create and run a bash script with the following content:
-
-.. code::
-
- # Have this in a step_1_run_docker_create_embeddings.sh
- INPUT_DIR=/path/to/your/dataset
- SHARED_DIR=/path/to/shared
- OUTPUT_DIR=/path/to/output
-
- LIGHTLY_TOKEN= # put your token here
- N_SAMPLES=1.0
-
- docker run --gpus all --rm -it \
- -v ${INPUT_DIR}:/home/input_dir:ro \
- -v ${SHARED_DIR}:/home/shared_dir:ro \
- -v ${OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=${LIGHTLY_TOKEN} \
- lightly.loader.num_workers=4 \
-        stopping_condition.n_samples=${N_SAMPLES} \
- method=coreset \
- enable_training=True \
- lightly.trainer.max_epochs=20
-
-Running it will create a terminal output similar to the following:
-
-.. code-block::
-
- [2021-09-29 13:32:11] Loading initial dataset...
- [2021-09-29 13:32:11] Found 372 input images in input_dir.
- [2021-09-29 13:32:11] Lightly On-Premise License is valid
- [2021-09-29 13:32:11] Checking for corrupt images (disable with enable_corruptness_check=False).
- Corrupt images found: 0: 100%|██████████████████| 372/372 [00:01<00:00, 310.35it/s]
- [2021-09-29 13:32:14] Training self-supervised model.
- GPU available: True, used: True
- [2021-09-29 13:32:57,696][lightning][INFO] - GPU available: True, used: True
- TPU available: None, using: 0 TPU cores
- [2021-09-29 13:32:57,697][lightning][INFO] - TPU available: None, using: 0 TPU cores
- LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
- [2021-09-29 13:32:57,697][lightning][INFO] - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
-
- | Name | Type | Params
- -----------------------------------------
- 0 | model | SimCLR | 11.2 M
- 1 | criterion | NTXentLoss | 0
- -----------------------------------------
- 11.2 M Trainable params
- 0 Non-trainable params
- [2021-09-29 13:34:29,772][lightning][INFO] - Saving latest checkpoint...
- Epoch 19: 100%|████████████████████████████████| 23/23 [00:04<00:00, 5.10it/s, loss=2.52, v_num=0]
- [2021-09-29 13:34:29] Embedding images.
- Compute efficiency: 0.90: 100%|█████████████████████████| 24/24 [00:01<00:00, 21.85it/s]
- [2021-09-29 13:34:31] Saving embeddings to output_dir/2021-09-29/13:32:11/data/embeddings.csv.
- [2021-09-29 13:34:31] Unique embeddings are stored in output_dir/2021-09-29/13:32:11/data/embeddings.csv
- [2021-09-29 13:34:31] Normalizing embeddings to unit length (disable with normalize_embeddings=False).
- [2021-09-29 13:34:31] Normalized embeddings are stored in output_dir/2021-09-29/13:32:11/data/normalized_embeddings.csv
- [2021-09-29 13:34:31] Sampling dataset with stopping condition: n_samples=372
- [2021-09-29 13:34:31] Skipped sampling because the number of remaining images is smaller than the number of requested samples.
- [2021-09-29 13:34:31] Writing report to output_dir/2021-09-29/13:32:11/report.pdf.
- [2021-09-29 13:35:04] Writing csv with information about removed samples to output_dir/2021-09-29/13:32:11/removed_samples.csv
- [2021-09-29 13:35:04] Done!
-
-Running it will create an `embeddings.csv` file
-in the output directory. Locate it and save the path to it.
-E.g. it may be found under
-`/path/to/output/2021-09-28/15:47:34/data/embeddings.csv`
-
-It should look similar to this:
-
-+----------------+--------------+--------------+--------------+--------------+---------+
-| filenames | embedding_0 | embedding_1 | embedding_2 | embedding_3 | labels |
-+================+==============+==============+==============+==============+=========+
-| cats/0001.jpg | 0.29625183 | 0.50055015 | 0.36491454 | 0.8156051 | 0 |
-+----------------+--------------+--------------+--------------+--------------+---------+
-| dogs/0005.jpg | 0.36491454 | 0.29625183 | 0.38491454 | 0.36491454 | 1 |
-+----------------+--------------+--------------+--------------+--------------+---------+
-| cats/0014.jpg | 0.8156051 | 0.59055015 | 0.29625183 | 0.50055015 | 0 |
-+----------------+--------------+--------------+--------------+--------------+---------+
-
-
-Add Active Learning Scores
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-You can use the predictions from your model as active learning scores.
-
-.. note:: You can also use your own scorers. Just make sure that you get a value
- between `0.0` and `1.0` for each sample. A number close to `1.0` should
- indicate a very important sample you want to be selected with a higher
- probability.
-
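-For example, an uncertainty margin score can be computed from softmax predictions
-like this (a sketch; `probabilities` stands in for the per-class probabilities from your model):
-
-.. code-block:: python
-
-    import numpy as np
-
-    # one row of class probabilities per image, in the same order as the embeddings.csv
-    probabilities = np.array([[0.7, 0.2, 0.1], [0.4, 0.35, 0.25]])
-
-    sorted_probs = np.sort(probabilities, axis=1)
-    # 1 - (top1 - top2): close to 1.0 means the model is uncertain about the sample
-    scores = 1.0 - (sorted_probs[:, -1] - sorted_probs[:, -2])
-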
-We provide a simple Python script to append a list of `scores` to the `embeddings.csv` file.
-
-.. code-block:: python
-
- # Have this in a step_2_add_al_scores.py
-
- from typing import Iterable
- import csv
- import os
-
- """
- Run your detection model here
- Use the scorers offered by lightly to generate active learning scores.
- """
-
- # Let's assume that you have one active learning score for every image.
- # WARNING: The order of the scores MUST match the order of filenames
- # in the embeddings.csv.
-    scores: Iterable[float] = ...  # must be an iterable of floats,
-                                   # e.g. a list of floats or a 1d numpy array
-
- # define the function to add the scores to the embeddings.csv
- def add_al_scores_to_csv(
- input_file_path: str, output_file_path: str,
- scores: Iterable[float], column_name: str = "al_score"
- ):
- with open(input_file_path, 'r') as read_obj:
- with open(output_file_path, 'w') as write_obj:
- csv_reader = csv.reader(read_obj)
- csv_writer = csv.writer(write_obj)
-
- # add the column name
- first_row = next(csv_reader)
- first_row.append(column_name)
- csv_writer.writerow(first_row)
-
- # add the scores
- for row, score in zip(csv_reader, scores):
- row.append(str(score))
- csv_writer.writerow(row)
-
- # use the function
- # adapt the following line to use the correct path to the embeddings.csv
- input_embeddings_csv = '/path/to/output/2021-07-28/12:00:00/data/embeddings.csv'
- output_embeddings_csv = input_embeddings_csv.replace('.csv', '_al.csv')
- add_al_scores_to_csv(input_embeddings_csv, output_embeddings_csv, scores)
-
- print("Use the following path to the embeddings_al.csv in the next step:")
- print(output_embeddings_csv)
-
-Running it will create a terminal output similar to the following:
-
-.. code-block::
-
- (base) user@machine:~/GitHub/playground/docker_with_al$ sudo python3 step_2_add_al_scores.py
-    Use the following path to the embeddings_al.csv in the next step:
- /path/to/output/2021-07-28/12:00:00/data/embeddings_al.csv
-
-Your embeddings_al.csv should look similar to this:
-
-+----------------+--------------+--------------+--------------+--------------+---------+-----------+
-| filenames | embedding_0 | embedding_1 | embedding_2 | embedding_3 | labels | al_score |
-+================+==============+==============+==============+==============+=========+===========+
-| cats/0001.jpg | 0.29625183 | 0.50055015 | 0.36491454 | 0.8156051 | 0 | 0.7231 |
-+----------------+--------------+--------------+--------------+--------------+---------+-----------+
-| dogs/0005.jpg | 0.36491454 | 0.29625183 | 0.38491454 | 0.36491454 | 1 | 0.91941 |
-+----------------+--------------+--------------+--------------+--------------+---------+-----------+
-| cats/0014.jpg | 0.8156051 | 0.59055015 | 0.29625183 | 0.50055015 | 0 | 0.01422 |
-+----------------+--------------+--------------+--------------+--------------+---------+-----------+
-
-
-Run Active Learning using the Docker
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-At this point you should have an `embeddings.csv` file with the active learning
-scores in a column named `al_score`.
-
-We can now perform an active learning iteration using the `coral` selection strategy. In order
-to do the selection on the `embeddings.csv` file we need to make this file
-accessible to the docker. We can do this by using the `shared_dir` feature of the
-docker as described in :ref:`docker-sampling-from-embeddings`.
-
-E.g. use the following bash script.
-
-.. code-block:: bash
-
- #!/bin/bash -e
-
- # Have this in a step_3_run_docker_coral.sh
-
- INPUT_DIR=/path/to/your/dataset/
- SHARED_DIR=/path/to/shared/
- OUTPUT_DIR=/path/to/output/
-
- EMBEDDING_FILE= # insert the path printed in the last step here.
- # e.g. /path/to/output/2021-07-28/12:00:00/data/embeddings_al.csv
-
-    cp ${EMBEDDING_FILE} ${SHARED_DIR}  # copy the embedding file to the shared directory
- EMBEDDINGS_REL_TO_SHARED=embeddings_al.csv
-
-
- LIGHTLY_TOKEN= # put your token here
- N_SAMPLES= # Choose how many samples you want to use here, e.g. 0.1 for 10 percent.
-
- docker run --gpus all --rm -it \
- -v ${INPUT_DIR}:/home/input_dir:ro \
- -v ${SHARED_DIR}:/home/shared_dir:ro \
- -v ${OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=${LIGHTLY_TOKEN} \
- lightly.loader.num_workers=4 \
-        stopping_condition.n_samples=${N_SAMPLES} \
- method=coral \
- enable_training=False \
- dump_dataset=True \
- upload_dataset=False \
- embeddings=${EMBEDDINGS_REL_TO_SHARED} \
- active_learning_score_column_name="al_score" \
- scorer=""
-
-Your terminal output should look similar to this:
-
-.. code-block::
-
- [2021-09-29 09:36:27] Loading initial embedding file...
- [2021-09-29 09:36:27] Output images will not be resized.
- [2021-09-29 09:36:27] Found 372 input images in shared_dir/embeddings_al.csv.
- [2021-09-29 09:36:27] Lightly On-Premise License is valid
- [2021-09-29 09:36:28] Removing exact duplicates (disable with remove_exact_duplicates=False).
- [2021-09-29 09:36:28] Found 0 exact duplicates.
- [2021-09-29 09:36:28] Unique embeddings are stored in shared_dir/embeddings_al.csv
- [2021-09-29 09:36:28] Normalizing embeddings to unit length (disable with normalize_embeddings=False).
- [2021-09-29 09:36:28] Normalized embeddings are stored in output_dir/2021-09-29/09:36:27/data/normalized_embeddings.csv
- [2021-09-29 09:36:28] Sampling dataset with stopping condition: n_samples=10
- [2021-09-29 09:36:28] Sampled 10 images.
- [2021-09-29 09:36:28] Writing report to output_dir/2021-09-29/09:36:27/report.pdf.
- [2021-09-29 09:36:56] Writing csv with information about removed samples to output_dir/2021-09-29/09:36:27/removed_samples.csv
- [2021-09-29 09:36:56] Done!
diff --git a/docs/source/docker_archive/advanced/code_examples/active_learning_worker_config.txt b/docs/source/docker_archive/advanced/code_examples/active_learning_worker_config.txt
deleted file mode 100644
index 1f52ad826..000000000
--- a/docs/source/docker_archive/advanced/code_examples/active_learning_worker_config.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-{
- enable_corruptness_check: true,
- remove_exact_duplicates: true,
- enable_training: false,
- pretagging: false,
- pretagging_debug: false,
- method: 'coral',
- stopping_condition: {
- n_samples: 0.1,
- min_distance: -1
- },
- scorer: 'object-frequency',
- scorer_config: {
- frequency_penalty: 0.25,
- min_score: 0.9
- },
- active_learning: {
- task_name: 'my-classification-task',
- score_name: 'uncertainty_margin'
- }
-}
\ No newline at end of file
diff --git a/docs/source/docker_archive/advanced/code_examples/object_level_worker_config.txt b/docs/source/docker_archive/advanced/code_examples/object_level_worker_config.txt
deleted file mode 100644
index a4740f931..000000000
--- a/docs/source/docker_archive/advanced/code_examples/object_level_worker_config.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-{
- object_level: {
- task_name: 'vehicles_object_detections'
- },
- enable_corruptness_check: true,
- remove_exact_duplicates: true,
- enable_training: false,
- pretagging: false,
- pretagging_debug: false,
- method: 'coreset',
- stopping_condition: {
- n_samples: 0.1,
- min_distance: -1
- },
- scorer: 'object-frequency',
- scorer_config: {
- frequency_penalty: 0.25,
- min_score: 0.9
- },
- active_learning: {
- task_name: '',
- score_name: 'uncertainty_margin'
- }
-}
\ No newline at end of file
diff --git a/docs/source/docker_archive/advanced/code_examples/object_level_worker_config_pretagging.txt b/docs/source/docker_archive/advanced/code_examples/object_level_worker_config_pretagging.txt
deleted file mode 100644
index 39ce861cf..000000000
--- a/docs/source/docker_archive/advanced/code_examples/object_level_worker_config_pretagging.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-{
- object_level: {
- task_name: 'lightly_pretagging'
- },
- enable_corruptness_check: true,
- remove_exact_duplicates: true,
- enable_training: false,
- pretagging: true,
- pretagging_debug: false,
- method: 'coreset',
- stopping_condition: {
- n_samples: 0.1,
- min_distance: -1
- },
- scorer: 'object-frequency',
- scorer_config: {
- frequency_penalty: 0.25,
- min_score: 0.9
- },
- active_learning: {
- task_name: '',
- score_name: 'uncertainty_margin'
- }
-}
\ No newline at end of file
diff --git a/docs/source/docker_archive/advanced/code_examples/python_run_active_learning.py b/docs/source/docker_archive/advanced/code_examples/python_run_active_learning.py
deleted file mode 100644
index 9eba6c458..000000000
--- a/docs/source/docker_archive/advanced/code_examples/python_run_active_learning.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import lightly
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="LIGHTLY_TOKEN", dataset_id="DATASET_ID")
-
-# Schedule the docker run with
-# - "active_learning.task_name" set to your task name
-# - "method" set to "coral"
-# All other settings are default values and we show them so you can easily edit
-# the values according to your need.
-client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": False,
- "pretagging_debug": False,
- "method": "coral",
- "stopping_condition": {"n_samples": 0.1, "min_distance": -1},
- "scorer": "object-frequency",
- "scorer_config": {"frequency_penalty": 0.25, "min_score": 0.9},
- "active_learning": {
- "task_name": "my-classification-task",
- "score_name": "uncertainty_margin",
- },
- },
- lightly_config={
- "loader": {
- "batch_size": 16,
- "shuffle": True,
- "num_workers": -1,
- "drop_last": True,
- },
- "model": {"name": "resnet-18", "out_dim": 128, "num_ftrs": 32, "width": 1},
- "trainer": {"gpus": 1, "max_epochs": 100, "precision": 32},
- "criterion": {"temperature": 0.5},
- "optimizer": {"lr": 1, "weight_decay": 0.00001},
- "collate": {
- "input_size": 64,
- "cj_prob": 0.8,
- "cj_bright": 0.7,
- "cj_contrast": 0.7,
- "cj_sat": 0.7,
- "cj_hue": 0.2,
- "min_scale": 0.15,
- "random_gray_scale": 0.2,
- "gaussian_blur": 0.5,
- "kernel_size": 0.1,
- "vf_prob": 0,
- "hf_prob": 0.5,
- "rr_prob": 0,
- },
- },
-)
diff --git a/docs/source/docker_archive/advanced/code_examples/python_run_object_level.py b/docs/source/docker_archive/advanced/code_examples/python_run_object_level.py
deleted file mode 100644
index 08b73a1e5..000000000
--- a/docs/source/docker_archive/advanced/code_examples/python_run_object_level.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import lightly
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="LIGHTLY_TOKEN", dataset_id="DATASET_ID")
-
-# Schedule the docker run with the "object_level.task_name" argument set.
-# All other settings are default values and we show them so you can easily edit
-# the values according to your need.
-client.schedule_compute_worker_run(
- worker_config={
- "object_level": {"task_name": "vehicles_object_detections"},
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": False,
- "pretagging_debug": False,
- "method": "coreset",
- "stopping_condition": {"n_samples": 0.1, "min_distance": -1},
- "scorer": "object-frequency",
- "scorer_config": {"frequency_penalty": 0.25, "min_score": 0.9},
- "active_learning": {"task_name": "", "score_name": "uncertainty_margin"},
- },
- lightly_config={
- "loader": {
- "batch_size": 16,
- "shuffle": True,
- "num_workers": -1,
- "drop_last": True,
- },
- "model": {"name": "resnet-18", "out_dim": 128, "num_ftrs": 32, "width": 1},
- "trainer": {"gpus": 1, "max_epochs": 100, "precision": 32},
- "criterion": {"temperature": 0.5},
- "optimizer": {"lr": 1, "weight_decay": 0.00001},
- "collate": {
- "input_size": 64,
- "cj_prob": 0.8,
- "cj_bright": 0.7,
- "cj_contrast": 0.7,
- "cj_sat": 0.7,
- "cj_hue": 0.2,
- "min_scale": 0.15,
- "random_gray_scale": 0.2,
- "gaussian_blur": 0.5,
- "kernel_size": 0.1,
- "vf_prob": 0,
- "hf_prob": 0.5,
- "rr_prob": 0,
- },
- },
-)
diff --git a/docs/source/docker_archive/advanced/code_examples/python_run_object_level_pretagging.py b/docs/source/docker_archive/advanced/code_examples/python_run_object_level_pretagging.py
deleted file mode 100644
index 76e237aa8..000000000
--- a/docs/source/docker_archive/advanced/code_examples/python_run_object_level_pretagging.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import lightly
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="LIGHTLY_TOKEN", dataset_id="DATASET_ID")
-
-# Schedule the docker run with the "object_level.task_name" argument set to
-# "lightly_pretagging" and with "pretagging" set to True.
-# All other settings are default values and we show them so you can easily edit
-# the values according to your need.
-client.schedule_compute_worker_run(
- worker_config={
- "object_level": {"task_name": "lightly_pretagging"},
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": True,
- "pretagging_debug": False,
- "method": "coreset",
- "stopping_condition": {"n_samples": 0.1, "min_distance": -1},
- "scorer": "object-frequency",
- "scorer_config": {"frequency_penalty": 0.25, "min_score": 0.9},
- "active_learning": {"task_name": "", "score_name": "uncertainty_margin"},
- },
- lightly_config={
- "loader": {
- "batch_size": 16,
- "shuffle": True,
- "num_workers": -1,
- "drop_last": True,
- },
- "model": {"name": "resnet-18", "out_dim": 128, "num_ftrs": 32, "width": 1},
- "trainer": {"gpus": 1, "max_epochs": 100, "precision": 32},
- "criterion": {"temperature": 0.5},
- "optimizer": {"lr": 1, "weight_decay": 0.00001},
- "collate": {
- "input_size": 64,
- "cj_prob": 0.8,
- "cj_bright": 0.7,
- "cj_contrast": 0.7,
- "cj_sat": 0.7,
- "cj_hue": 0.2,
- "min_scale": 0.15,
- "random_gray_scale": 0.2,
- "gaussian_blur": 0.5,
- "kernel_size": 0.1,
- "vf_prob": 0,
- "hf_prob": 0.5,
- "rr_prob": 0,
- },
- },
-)
diff --git a/docs/source/docker_archive/advanced/datapool.rst b/docs/source/docker_archive/advanced/datapool.rst
deleted file mode 100644
index 67e6cdde0..000000000
--- a/docs/source/docker_archive/advanced/datapool.rst
+++ /dev/null
@@ -1,107 +0,0 @@
-Datapool
-=================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-The Lightly Datapool is a tool which allows users to incrementally build up a
-dataset for their project. It keeps track of the representations of previously
-selected samples and uses this information to pick new samples in order to
-maximize the quality of the final dataset. It also allows for combining two
-different datasets into one.
-
-- | If you're interested in how the datapool works, go to
- | --> `How It Works`_
-
-- | To see how you can use the datapool, check out
- | --> `Usage`_
-
-
-How It Works
----------------
-
-The Lightly Datapool keeps track of the selected samples in a csv file called
-`datapool_latest.csv`. It contains the filenames of the selected images, their
-embeddings, and their weak labels. Additionally, after training a self-supervised
-model, the datapool contains the checkpoint `checkpoint_latest.ckpt` which was
-used to generate the embeddings.
-
-The datapool is located in the `shared` directory. In general, it is a directory
-with the following structure:
-
-
-.. code-block:: bash
-
- # example of a datapool
- datapool/
- +--- datapool_latest.csv
- +--- checkpoint_latest.ckpt
- +--- history/
-
-The files `datapool_latest.csv` and `checkpoint_latest.ckpt` are updated after every
-run of the Lightly Docker. The history folder contains the previous versions of
-the datapool. This feature is meant to prevent accidental overwrites and can be
-deactivated from the command-line (see `Usage`_ for more information).
-
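-If you want to inspect which samples are already part of the pool, you can peek at
-`datapool_latest.csv` directly. The sketch below uses pandas; the datapool path and the
-exact column names depend on your setup and are only placeholders here:
-
-.. code-block:: python
-
-    import pandas as pd
-
-    # read the datapool csv from the shared directory (path is a placeholder)
-    datapool = pd.read_csv("/path/to/shared/my_datapool/datapool_latest.csv")
-
-    # the file contains the filenames, embeddings, and weak labels of the
-    # previously selected samples
-    print(len(datapool), "samples already in the datapool")
-    print(datapool.columns.tolist())
-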
-Usage
----------------
-
-To **initialize** a datapool, simply pass the name of the datapool as an argument
-to your docker run command and sample from a dataset as always. The Lightly Docker
-will automatically create a datapool directory and populate it with the required
-files.
-
-.. note:: To use the datapool feature, the Lightly Docker requires write access
- to a shared directory. This directory can be passed with the `-v` flag.
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- append_weak_labels=False \
- stopping_condition.min_distance=0.1 \
- datapool.name=my_datapool
-
-
-To **append** to your datapool, pass the name of an existing datapool as an argument.
-The Lightly Docker will read the embeddings and filenames from the existing pool and
-consider them during selection. Then, it will update the datapool and checkpoint files.
-
-.. note:: You can't change the dimension of the embeddings once the datapool has
- been initialized so choose carefully!
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {OTHER_INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- append_weak_labels=False \
- stopping_condition.min_distance=0.1 \
- datapool.name=my_datapool
-
-
-To **deactivate automatic archiving** of the past datapool versions, you can
-set the flag `datapool.keep_history` to False.
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- append_weak_labels=False \
- stopping_condition.min_distance=0.1 \
- datapool.name=my_datapool \
- datapool.keep_history=False
diff --git a/docs/source/docker_archive/advanced/datasource_metadata.rst b/docs/source/docker_archive/advanced/datasource_metadata.rst
deleted file mode 100644
index bc4ad9446..000000000
--- a/docs/source/docker_archive/advanced/datasource_metadata.rst
+++ /dev/null
@@ -1,239 +0,0 @@
-.. _ref-docker-datasource-metadata:
-
-Add Metadata to a Datasource
-===============================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Lightly can make use of metadata collected alongside your images or videos. Provided
-metadata can be used to steer the selection process and to analyze the selected dataset
-in the Lightly Platform.
-
-
-Metadata Folder Structure
-----------------------------
-
-In the following, we outline the format in which metadata can be added to a
-Lightly datasource. Everything regarding metadata will take place in a subdirectory
-of your configured datasource called `.lightly/metadata`. The general structure
-of this directory will look like this:
-
-
-.. code-block:: bash
-
- datasource/my_dataset
- + image_1.png
- + image_2.png
- + ...
- + image_N.png
- + .lightly/metadata
- + schema.json
- + image_1.json
- ...
- + image_N.json
-
-
-All of the `.json` files are explained in the next sections.
-
-
-
-
-Metadata Schema
----------------
-The schema defines the format of the metadata and helps the Lightly Platform to correctly identify
-and display different types of metadata.
-
-You can provide this information to Lightly by adding a `schema.json` to the
-`.lightly/metadata` directory. The `schema.json` file must contain a list of
-configuration entries. Each of the entries is a dictionary with the following keys:
-
- - `name`: Identifier of the metadata in the UI.
- - `path`: Concatenation of the keys to access the metadata in a dictionary.
- - `defaultValue`: The fallback value if there is no metadata available.
- - `valueDataType`: One of
-
- - `NUMERIC_INT`
- - `NUMERIC_FLOAT`
- - `CATEGORICAL_INT`
- - `CATEGORICAL_STRING`
- - `CATEGORICAL_BOOLEAN`
-
-
-For example, let's say we have additional information about the scene and weather for each
-of the images we have collected. A possible schema could look like this:
-
-.. code-block:: javascript
- :caption: .lightly/metadata/schema.json
-
- [
- {
- "name": "Scene",
- "path": "scene",
- "defaultValue": "undefined",
- "valueDataType": "CATEGORICAL_STRING"
-        },
- {
- "name": "Weather description",
- "path": "weather.description",
- "defaultValue": "nothing",
- "valueDataType": "CATEGORICAL_STRING"
- },
- {
- "name": "Temperature",
- "path": "weather.temperature",
- "defaultValue": 0.0,
- "valueDataType": "NUMERIC_FLOAT"
- },
- {
- "name": "Air pressure",
- "path": "weather.air_pressure",
- "defaultValue": 0,
- "valueDataType": "NUMERIC_INT"
- },
- {
- "name": "Vehicle ID",
- "path": "vehicle_id",
- "defaultValue": 0,
- "valueDataType": "CATEGORICAL_INT"
- }
- ]
-
-
-
-
-Metadata Files
---------------
-Lightly requires a single metadata file per image or video. If an image or video has no corresponding metadata file,
-Lightly assumes the default value from the `schema.json`. If a metadata file is provided for a full video,
-Lightly assumes that the metadata is valid for all frames in that video.
-
-To provide metadata for an image or a video, place a metadata file with the same name
-as the image or video in the `.lightly/metadata` directory but change the file extension to
-`.json`. The file should contain the metadata in the format defined under :ref:`ref-metadata-format`.
-
-
-.. code-block:: bash
-
- # filename of the metadata for file FILENAME.EXT
- .lightly/metadata/${FILENAME}.json
-
- # example: my_image.png
- .lightly/metadata/my_image.json
-
- # example: my_video.mp4
- .lightly/metadata/my_video.json
-
-
-When working with videos it's also possible to provide metadata on a per-frame basis.
-Then, Lightly requires a metadata file per frame. If a frame has no corresponding metadata file,
-Lightly assumes the default value from the `schema.json`. Lightly uses a naming convention to
-identify frames: The filename of a frame consists of the video filename, the frame number
-(padded to the length of the number of frames in the video), and the video format, separated
-by hyphens. For example, for a video with 200 frames, the frame number will be padded
-to length three. For a video with 1000 frames, the frame number will be padded to length four (99 becomes 0099).
-
-
-.. code-block:: bash
-
- # filename of the metadata of the Xth frame of video FILENAME.EXT
- # with 200 frames (padding: len(str(200)) = 3)
- .lightly/metadata/${FILENAME}-${X:03d}-${EXT}.json
-
- # example: my_video.mp4, frame 99/200
- .lightly/metadata/my_video-099-mp4.json
-
- # example: my_subdir/my_video.mp4, frame 99/200
- .lightly/metadata/my_subdir/my_video-099-mp4.json
-
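-If you generate these filenames in a script, a small helper like the following can be
-useful. It is only a sketch of the naming convention described above; how you obtain the
-frame index and the total number of frames is up to you:
-
-.. code-block:: python
-
-    from pathlib import Path
-
-    def frame_metadata_filename(video_path, frame_index, n_frames):
-        """Builds the metadata filename for a single frame of a video.
-
-        The name consists of the video filename, the zero-padded frame number,
-        and the video format, separated by hyphens.
-        """
-        path = Path(video_path)
-        padding = len(str(n_frames))
-        ext = path.suffix.lstrip(".")
-        name = f"{path.stem}-{frame_index:0{padding}d}-{ext}.json"
-        return str(path.parent / name)
-
-    # example: my_subdir/my_video.mp4, frame 99/200
-    print(frame_metadata_filename("my_subdir/my_video.mp4", 99, 200))
-    # -> my_subdir/my_video-099-mp4.json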
-
-.. _ref-metadata-format:
-
-Metadata Format
----------------
-
-The metadata json files for images and videos require the keys `file_name`, `type`, and `metadata`.
-Here, `file_name` serves as a unique identifier to retrieve the original file for which the metadata was collected,
-`type` indicates whether the metadata is per "video", "frame", or "image", and `metadata` contains the actual metadata.
-
-For our example from above, a metadata file corresponding to an image/video/frame should look like this:
-
-
-.. tabs::
-
-
- .. tab:: Video
-
- .. code-block:: javascript
- :caption: .lightly/metadata/my_video.json
-
- {
- "file_name": "my_video.mp4",
- "type": "video",
- "metadata": {
- "scene": "city street",
- "weather": {
- "description": "sunny",
- "temperature": 23.2,
- "air_pressure": 1
- },
-                    "vehicle_id": 321
- }
- }
-
- .. tab:: Frame
-
- .. code-block:: javascript
- :caption: .lightly/metadata/my_video-099-mp4.json
-
- {
- "file_name": "my_video-099-mp4.png",
- "type": "frame",
- "metadata": {
- "scene": "city street",
- "weather": {
- "description": "sunny",
- "temperature": 23.2,
- "air_pressure": 1
- },
-                    "vehicle_id": 321
- }
- }
-
- .. tab:: Image
-
- .. code-block:: javascript
- :caption: .lightly/metadata/my_image.json
-
- {
- "file_name": "my_image.png",
- "type": "image",
- "metadata": {
- "scene": "highway",
- "weather": {
- "description": "rainy",
- "temperature": 10.5,
- "air_pressure": 1
- },
-                    "vehicle_id": 321
- }
- }
-
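-If you create these files programmatically, a small sketch like the following writes the
-image example above to the right place (the dataset path is only a placeholder):
-
-.. code-block:: python
-
-    import json
-    from pathlib import Path
-
-    metadata_dir = Path("datasource/my_dataset/.lightly/metadata")
-    metadata_dir.mkdir(parents=True, exist_ok=True)
-
-    metadata = {
-        "file_name": "my_image.png",
-        "type": "image",
-        "metadata": {
-            "scene": "highway",
-            "weather": {"description": "rainy", "temperature": 10.5, "air_pressure": 1},
-            "vehicle_id": 321,
-        },
-    }
-
-    # the metadata file has the same name as the image, with a .json extension
-    with open(metadata_dir / "my_image.json", "w") as f:
-        json.dump(metadata, f)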
-
-
-
-Next Steps
-----------
-
-If metadata is provided, the Lightly Worker will automatically detect and load it into
-the Lightly Platform where it can be visualized and analyzed after running a selection.
-
-For example, it's possible to visualize the different categories of metadata in the Lightly
-Platform scatter plot. In the following example we visualized the categorical metadata "Scene"
-from the BDD100k dataset.
-
-
-.. figure:: images/bdd100k_demo_metadata.jpg
diff --git a/docs/source/docker_archive/advanced/datasource_predictions.rst b/docs/source/docker_archive/advanced/datasource_predictions.rst
deleted file mode 100644
index d4dea51c5..000000000
--- a/docs/source/docker_archive/advanced/datasource_predictions.rst
+++ /dev/null
@@ -1,483 +0,0 @@
-.. _ref-docker-datasource-predictions:
-
-Add Predictions to a Datasource
-===============================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Lightly can not only use the images you provide in a datasource, but also predictions
-of an ML model on your images. They are used in active learning to select images based
-on the objects they contain. Object detection predictions can also be used to run
-Lightly on object level. By providing the predictions in the datasource, you have full
-control over them and they scale well to millions of samples. Furthermore, if you add
-new samples to your datasource, you can add their predictions at the same time.
-If you already have labels instead of predictions, you can treat them
-just as predictions and upload them the same way.
-
-.. note:: Working with predictions requires Lightly Worker version 2.2 or
-    newer. You can check your installed version of the
- Lightly Worker by running the :ref:`ref-docker-setup-sanity-check`.
-
-Predictions Folder Structure
-----------------------------
-
-In the following, we will outline the format of the predictions required by the
-Lightly docker. Everything regarding predictions will take place in a subdirectory
-of your configured datasource called `.lightly/predictions`. The general structure
-of this directory will look like this:
-
-
-.. code-block:: bash
-
- datasource/my_dataset
- + image_1.png
- + image_2.png
- + ...
- + image_N.png
- + .lightly/predictions/
- + tasks.json
- + task_1/
- + schema.json
- + image_1.json
- ...
- + image_N.json
- + task_2/
- + schema.json
- + image_1.json
- ...
- + image_N.json
-
-
-
-Where each subdirectory corresponds to one prediction task (e.g. a classification task
-and an object detection task). All of the files are explained in the next sections.
-
-
-Prediction Tasks
-----------------
-To let Lightly know what kind of prediction tasks you want to work with, Lightly
-needs to know their names. Simply add a `tasks.json` file to your storage bucket
-in the subdirectory `.lightly/predictions/`.
-
-The `tasks.json` file must include a list of your task names, which must match the
-names of the subdirectories where your prediction schemas are located.
-
-.. note::
-
- Only the task names listed within `tasks.json` will be considered.
- Please ensure that the task name corresponds with the location of your prediction schema.
-    This allows you to specify which subfolders are considered by the Lightly docker.
-
-For example, let's say we are working with the following folder structure:
-
-.. code-block:: bash
-
- .lightly/predictions/
- + tasks.json
- + classification_weather/
- + schema.json
- ...
- + classification_scenery/
- + schema.json
- ...
- + object_detection_people/
- + schema.json
- ...
- + semantic_segmentation_cars/
- + schema.json
- ...
- + some_directory_containing_irrelevant_things/
-
-
-we can specify which subfolders contain relevant predictions in the `tasks.json`:
-
-.. code-block:: javascript
- :caption: .lightly/predictions/tasks.json
-
- [
- "classification_weather",
- "classification_scenery",
- "object_detection_people",
-        "semantic_segmentation_cars"
- ]
-
-.. note::
-
- If you list a subfolder which doesn't contain a valid `schema.json` file,
-    the Lightly docker will report an error! See below for how to create a valid `schema.json` file.
-
-
-Prediction Schema
------------------
-Lightly requires a prediction schema for each task. The schema defines the
-format of the predictions and helps the Lightly Platform to correctly identify
-and display classes. It also helps to prevent errors, as all loaded predictions
-are validated against this schema.
-
-Every schema must include the type of the predictions for this task.
-For classification and object detection the prediction schema must also include
-all the categories and their corresponding ids. For other tasks, such as keypoint
-detection, it can be useful to store additional information like which keypoints
-are connected with each other by an edge.
-
-You can provide all this information to Lightly by adding a `schema.json` to the
-directory of the respective task. The schema.json file must have a key `categories`
-with a corresponding list of categories following the COCO annotation format.
-It must also have a key `task_type` indicating the type of the predictions.
-The `task_type` must be one of:
-
- - classification
- - object-detection
- - semantic-segmentation
-
-
-For example, let's say we are working with a classification model predicting the weather on an image.
-The three classes are sunny, clouded, and rainy.
-
-
-.. code-block:: javascript
- :caption: .lightly/predictions/classification_weather/schema.json
-
- {
- "task_type": "classification",
- "categories": [
- {
- "id": 0,
- "name": "sunny"
- },
- {
- "id": 1,
- "name": "clouded"
- },
- {
- "id": 2,
- "name": "rainy"
- }
- ]
- }
-
-
-
-Prediction Files
-----------------
-Lightly requires a **single prediction file per image**. The file should be a .json
-following the format defined under :ref:`ref-prediction-format` and stored in the subdirectory
-`.lightly/predictions/${TASK_NAME}` in the storage bucket the dataset was configured with.
-In order to make sure Lightly can match the predictions to the correct source image,
-it's necessary to follow the naming convention:
-
-.. code-block:: bash
-
- # filename of the prediction for image FILENAME.EXT
- .lightly/predictions/${TASK_NAME}/${FILENAME}.json
-
- # example: my_image.png, classification
- .lightly/predictions/my_classification_task/my_image.json
-
- # example: my_subdir/my_image.png, classification
- .lightly/predictions/my_classification_task/my_subdir/my_image.json
-
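-A small helper sketch for mapping an image path to its prediction file (purely
-illustrative; the task name is one of the names you defined in `tasks.json`):
-
-.. code-block:: python
-
-    from pathlib import Path
-
-    def prediction_path(task_name, image_path):
-        """Maps an image path to the corresponding prediction file."""
-        relative = Path(image_path).with_suffix(".json")
-        return Path(".lightly/predictions") / task_name / relative
-
-    # example: my_subdir/my_image.png, classification
-    print(prediction_path("my_classification_task", "my_subdir/my_image.png"))
-    # -> .lightly/predictions/my_classification_task/my_subdir/my_image.json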
-
-Prediction Files for Videos
----------------------------
-When working with videos, Lightly requires a prediction file per frame. Lightly
-uses a naming convention to identify frames: The filename of a frame consists of
-the video filename, the frame number (padded to the length of the number of frames
-in the video), and the video format, separated by hyphens. For example, for a
-video with 200 frames, the frame number will be padded to length three. For a video
-with 1000 frames, the frame number will be padded to length four (99 becomes 0099).
-
-.. code-block:: bash
-
- # filename of the predictions of the Xth frame of video FILENAME.EXT
- # with 200 frames (padding: len(str(200)) = 3)
- .lightly/predictions/${TASK_NAME}/${FILENAME}-${X:03d}-${EXT}.json
-
- # example: my_video.mp4, frame 99/200
- .lightly/predictions/my_classification_task/my_video-099-mp4.json
-
- # example: my_subdir/my_video.mp4, frame 99/200
- .lightly/predictions/my_classification_task/my_subdir/my_video-099-mp4.json
-
-
-.. _ref-prediction-format:
-
-Prediction Format
------------------
-Predictions for an image must have a `file_name` and `predictions`.
-Here, `file_name` serves as a unique identifier to retrieve the image for which
-the predictions are made, and `predictions` is a list of `Prediction Singletons` for the corresponding task.
-
-Example classification:
-
-.. code-block:: javascript
- :caption: .lightly/predictions/classification_weather/my_image.json
-
- {
- "file_name": "my_image.png",
- "predictions": [ // classes: [sunny, clouded, rainy]
- {
- "category_id": 0,
- "probabilities": [0.8, 0.1, 0.1]
- }
- ]
- }
-
-Example object detection:
-
-.. code-block:: javascript
- :caption: .lightly/predictions/object_detection/my_image.json
-
- {
- "file_name": "my_image.png",
- "predictions": [ // classes: [person, car]
- {
- "category_id": 0,
- "bbox": [140, 100, 80, 90], // x, y, w, h coordinates in pixels
- "score": 0.8
- },
- {
- "category_id": 1,
- "bbox": [...],
- "score": 0.9
- },
- {
- "category_id": 0,
- "bbox": [...],
- "score": 0.5
- }
- ]
- }
-
-Example semantic segmentation:
-
-.. code-block:: javascript
- :caption: .lightly/predictions/semantic_segmentation_cars/my_image.json
-
- {
- "file_name": "my_image.png",
- "predictions": [ // classes: [background, car]
- {
- "category_id": 0,
- "segmentation": [100, 80, 90, 85, ...], //run length encoded binary segmentation mask
- "score": 0.8
- },
- {
- "category_id": 1,
- "segmentation": [...],
- "score": 0.9
- },
- ]
- }
-
-.. note:: The filename should always be the full path from the root directory.
-
-
-Prediction Singletons
----------------------
-The prediction singletons closely follow the `COCO results `_ format while dropping
-the `image_id`. Note that the `category_id` must be the same as the one defined
-in the schema and that the probabilities (if provided) must follow the order of the category ids.
-
-**Classification:**
-
-For classification, please use the following format:
-
-.. code-block:: javascript
-
- [{
- "category_id" : int,
- "probabilities" : [p0, p1, ..., pN] // optional, sum up to 1.0
- }]
-
-**Object Detection:**
-
-For detection with bounding boxes, please use the following format:
-
-.. code-block:: javascript
-
- [{
- "category_id" : int,
- "bbox" : [x, y, width, height], // coordinates in pixels
- "score" : float,
- "probabilities" : [p0, p1, ..., pN] // optional, sum up to 1.0
- }]
-
-The bounding box format follows the `COCO results `_ documentation.
-
-.. note::
-
- Bounding Box coordinates are pixels measured from the top left image corner.
-
-**Semantic Segmentation:**
-
-For semantic segmentation, please use the following format:
-
-.. code-block:: javascript
-
- [{
- "category_id" : int,
- "segmentation" : [int, int, ...], // run length encoded binary segmentation mask
- "score" : float,
- "probabilities" : [p0, p1, ..., pN] // optional, sum up to 1.0
- }]
-
-Each segmentation prediction contains the binary mask for one category and a
-corresponding score. The score determines the likelihood of the segmentation
-belonging to that category. Optionally, a list of probabilities can be provided
-containing a probability for each category, indicating the likelihood that the
-segment belongs to that category.
-
-Segmentations are defined with binary masks where each pixel is either set to 0
-or 1 if it belongs to the background or the object, respectively.
-The segmentation masks are compressed using run length encoding to reduce file size.
-Binary segmentation masks can be converted to the required format using the
-following function:
-
-.. code-block:: python
-
- import numpy as np
-
- def encode(binary_mask):
- """Encodes a (H, W) binary segmentation mask with run length encoding.
-
- The run length encoding is an array with counts of subsequent 0s and 1s
- in the binary mask. The first value in the array is always the count of
- initial 0s.
-
- Examples:
-
- >>> binary_mask = [
- >>> [0, 0, 1, 1],
- >>> [0, 1, 1, 1],
- >>> [0, 0, 0, 1],
- >>> ]
- >>> encode(binary_mask)
- [2, 2, 1, 3, 3, 1]
- """
- flat = np.concatenate(([-1], np.ravel(binary_mask), [-1]))
- borders = np.nonzero(np.diff(flat))[0]
- rle = np.diff(borders)
- if flat[1]:
- rle = np.concatenate(([0], rle))
- return rle.tolist()
-
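-To double-check your masks, the encoding can be inverted again. The decode function below
-is not required by Lightly; it is only a small sanity-check sketch that reuses `encode`
-and `numpy` from the snippet above:
-
-.. code-block:: python
-
-    def decode(rle, shape):
-        """Decodes a run length encoded mask back into a (H, W) binary mask."""
-        flat = np.zeros(sum(rle), dtype=np.uint8)
-        start, value = 0, 0
-        for count in rle:
-            flat[start:start + count] = value
-            start += count
-            value = 1 - value  # runs alternate between 0s and 1s
-        return flat.reshape(shape)
-
-    binary_mask = np.array([
-        [0, 0, 1, 1],
-        [0, 1, 1, 1],
-        [0, 0, 0, 1],
-    ])
-    rle = encode(binary_mask)  # [2, 2, 1, 3, 3, 1]
-    assert np.array_equal(decode(rle, binary_mask.shape), binary_mask)
-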
-Segmentation models oftentimes output a probability for each pixel and category.
-Storing such probabilities can quickly result in large file sizes if the input
-images have a high resolution. To reduce storage requirements, Lightly expects
-only a single score or probability per segmentation. If you have scores or
-probabilities for each pixel in the image, you have to first aggregate them
-into a single score/probability. We recommend taking either the median or the mean
-score/probability over all pixels within the segmentation mask. The example
-below shows how pixelwise segmentation predictions can be converted to the
-format required by Lightly.
-
-.. code-block:: python
-
- # Make prediction for a single image. The output is assumed to be a tensor
- # with shape (categories, height, width).
- segmentation = model(image)
-
- # Most probable object category per pixel.
- category = segmentation.argmax(dim=0)
-
- # Convert to lightly predictions.
- predictions = []
- for category_id in category.unique():
- binary_mask = category == category_id
- median_score = segmentation[category_id, binary_mask].median()
- predictions.append({
- 'category_id': int(category_id),
- 'segmentation': encode(binary_mask),
- 'score': float(median_score),
- })
-
- prediction = {
- 'file_name': 'image_name.png',
- 'predictions': predictions,
- }
-
-
-.. note::
-
- Support for keypoint detection is coming soon!
-
-
-
-Creating the predictions folder
--------------------------------
-
-To create the predictions folder, we recommend writing a script that takes your predictions and
-saves them in the format just outlined. You can either save the predictions first on your local machine
-and then upload them to your datasource or save them directly to your datasource.
-
-As an example, the following script takes an object detection `COCO predictions file `_.
-It needs the path to the predictions file and the output directory
-where the `.lightly` folder should be created as input.
-Don't forget to change these 2 parameters at the top of the script.
-
-.. code-block:: python
-
- ### CHANGE THESE PARAMETERS
- output_filepath = "/path/to/create/.lightly/dir"
- annotation_filepath = "/path/to/_annotations.coco.json"
-
- ### Optionally change these parameters
- task_name = "my_object_detection_task"
- task_type = "object-detection"
-
- import json
- import os
- from pathlib import Path
-
- # create prediction directory
- path_predictions = os.path.join(output_filepath, '.lightly/predictions')
- Path(path_predictions).mkdir(exist_ok=True, parents=True)
-
- # Create task.json
- path_task_json = os.path.join(path_predictions, 'tasks.json')
- tasks = [task_name]
- with open(path_task_json, 'w') as f:
- json.dump(tasks, f)
-
- # read coco annotations
- with open(annotation_filepath, 'r') as f:
- coco_dict = json.load(f)
-
- # Create schema.json for task
- path_predictions_task = os.path.join(path_predictions, tasks[0])
- Path(path_predictions_task).mkdir(exist_ok=True)
- schema = {
- "task_type": task_type,
- "categories": coco_dict['categories']
- }
- path_schema_json = os.path.join(path_predictions_task, 'schema.json')
- with open(path_schema_json, 'w') as f:
- json.dump(schema, f)
-
- # Create predictions themselves
- image_id_to_prediction = dict()
- for image in coco_dict['images']:
- prediction = {
- 'file_name': image['file_name'],
- 'predictions': [],
- }
- image_id_to_prediction[image['id']] = prediction
- for ann in coco_dict['annotations']:
- pred = {
- 'category_id': ann['category_id'],
- 'bbox': ann['bbox'],
- 'score': ann.get('score', 0)
- }
- image_id_to_prediction[ann['image_id']]['predictions'].append(pred)
-
- for prediction in image_id_to_prediction.values():
- filename_prediction = os.path.splitext(prediction['file_name'])[0] + '.json'
- path_to_prediction = os.path.join(path_predictions_task, filename_prediction)
- with open(path_to_prediction, 'w') as f:
- json.dump(prediction, f)
diff --git a/docs/source/docker_archive/advanced/images/bdd100k_demo_metadata.jpg b/docs/source/docker_archive/advanced/images/bdd100k_demo_metadata.jpg
deleted file mode 100644
index 552140d0d..000000000
Binary files a/docs/source/docker_archive/advanced/images/bdd100k_demo_metadata.jpg and /dev/null differ
diff --git a/docs/source/docker_archive/advanced/images/object_level_vehicle_car_cluster.jpg b/docs/source/docker_archive/advanced/images/object_level_vehicle_car_cluster.jpg
deleted file mode 100644
index ced2c818f..000000000
Binary files a/docs/source/docker_archive/advanced/images/object_level_vehicle_car_cluster.jpg and /dev/null differ
diff --git a/docs/source/docker_archive/advanced/images/object_level_vehicle_crops_examples.jpg b/docs/source/docker_archive/advanced/images/object_level_vehicle_crops_examples.jpg
deleted file mode 100644
index 3acbdb83d..000000000
Binary files a/docs/source/docker_archive/advanced/images/object_level_vehicle_crops_examples.jpg and /dev/null differ
diff --git a/docs/source/docker_archive/advanced/images/object_level_vehicle_examples.jpg b/docs/source/docker_archive/advanced/images/object_level_vehicle_examples.jpg
deleted file mode 100644
index ee61b2b9e..000000000
Binary files a/docs/source/docker_archive/advanced/images/object_level_vehicle_examples.jpg and /dev/null differ
diff --git a/docs/source/docker_archive/advanced/images/object_level_vehicle_motorbike_cluster.jpg b/docs/source/docker_archive/advanced/images/object_level_vehicle_motorbike_cluster.jpg
deleted file mode 100644
index a38ba6fb1..000000000
Binary files a/docs/source/docker_archive/advanced/images/object_level_vehicle_motorbike_cluster.jpg and /dev/null differ
diff --git a/docs/source/docker_archive/advanced/images/object_level_vehicle_truck_cluster.jpg b/docs/source/docker_archive/advanced/images/object_level_vehicle_truck_cluster.jpg
deleted file mode 100644
index 06cbc9f6d..000000000
Binary files a/docs/source/docker_archive/advanced/images/object_level_vehicle_truck_cluster.jpg and /dev/null differ
diff --git a/docs/source/docker_archive/advanced/images/sequence_selection_pca.png b/docs/source/docker_archive/advanced/images/sequence_selection_pca.png
deleted file mode 100644
index c2d222ceb..000000000
Binary files a/docs/source/docker_archive/advanced/images/sequence_selection_pca.png and /dev/null differ
diff --git a/docs/source/docker_archive/advanced/meta_information.rst b/docs/source/docker_archive/advanced/meta_information.rst
deleted file mode 100644
index 418a92704..000000000
--- a/docs/source/docker_archive/advanced/meta_information.rst
+++ /dev/null
@@ -1,105 +0,0 @@
-
-.. _ref-docker-meta-information:
-
-Meta Information
-======================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Depending on your current setup one of the following topics might interest you:
-
-- | You have a dataset but want Lightly to "ignore" certain samples.
- | --> `Mask Samples`_
-
-- | You have an existing dataset and want to add only relevant new data.
- | --> `Use Pre-Selected Samples`_
-
-- | You have your own (weak) labels. Can Lightly use this information to improve
- the selection?
- | --> `Custom Weak Labels`_
-
-
-Mask Samples
------------------------------------
-
-You can add masking information to the .csv file to prevent certain samples
-from being used.
-
-The following example shows a dataset in which the column "masked" is used
-to prevent Lightly Docker from using this specific sample. In this example,
-img-1.jpg is simply ignored and not considered for selection. That is, the sample
-neither gets selected nor does it affect the selection of any other sample.
-
-.. list-table:: masked_embeddings.csv
- :widths: 50 50 50 50 50
- :header-rows: 1
-
- * - filenames
- - embedding_0
- - embedding_1
- - masked
- - labels
- * - img-1.jpg
- - 0.1
- - 0.5
- - 1
- - 0
- * - img-2.jpg
- - 0.2
- - 0.2
- - 0
- - 0
- * - img-3.jpg
- - 0.1
- - 0.9
- - 0
- - 0
-
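-One way to produce such a file from an existing `embeddings.csv` is a short pandas
-script. The masking rule below (a hard-coded list of filenames) is only a placeholder
-for whatever criterion you use:
-
-.. code-block:: python
-
-    import pandas as pd
-
-    df = pd.read_csv("embeddings.csv")
-
-    # mark all samples that Lightly Docker should ignore
-    ignore = {"img-1.jpg"}
-    df["masked"] = df["filenames"].isin(ignore).astype(int)
-
-    df.to_csv("masked_embeddings.csv", index=False)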
-
-Use Pre-Selected Samples
------------------------------------
-Very similar to masking samples, we can also pre-select specific samples. This
-can be useful for semi-automated data selection processes. A human annotator
-can pre-select some of the relevant samples and let Lightly Docker add only
-additional samples that enrich the existing selection.
-
-
-.. list-table:: selected_embeddings.csv
- :widths: 50 50 50 50 50
- :header-rows: 1
-
- * - filenames
- - embedding_0
- - embedding_1
- - selected
- - labels
- * - img-1.jpg
- - 0.1
- - 0.5
- - 0
- - 0
- * - img-2.jpg
- - 0.2
- - 0.2
- - 0
- - 0
- * - img-3.jpg
- - 0.1
- - 0.9
- - 1
- - 0
-
-.. note:: Pre-selected samples also count towards the target number of samples.
- For example, you have a dataset with 100 samples. If you have preselected
- 60 samples and want to select another 10,
- you have to set the target number of samples to 70.
-
-Custom Weak Labels
------------------------------------
-
-You can always add custom weak labels to the dataset by following the guide
-here: :ref:`lightly-custom-labels`
\ No newline at end of file
diff --git a/docs/source/docker_archive/advanced/object_level.rst b/docs/source/docker_archive/advanced/object_level.rst
deleted file mode 100644
index 42dd1755a..000000000
--- a/docs/source/docker_archive/advanced/object_level.rst
+++ /dev/null
@@ -1,297 +0,0 @@
-.. _ref-docker-object-level:
-
-Object Level
-============
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Lightly not only works on full images but also on an object level. This
-workflow is especially useful for datasets containing small objects or multiple
-objects in each image and provides the following benefits over the full image
-workflow:
-
-- Analyze a dataset based on individual objects
-- Find a diverse set of objects in the dataset
-- Find images that contain objects of interest
-- Full control over the type of objects to process
-- Ignore uninteresting background regions in images
-- Automatic cropping of objects from the original image
-
-.. note:: The object level features require Lightly Worker version 2.2 or
-    newer. You can check your installed version of the
- Lightly Worker by running the :ref:`ref-docker-setup-sanity-check`.
-
-
-Prerequisites
--------------
-In order to use the object level workflow with Lightly, you will need the
-following things:
-
-- The installed Lightly docker (see :ref:`ref-docker-setup`)
-- A dataset with a configured datasource (see :ref:`ref-docker-with-datasource-datapool`)
-- Object detection predictions uploaded to the datasource (see next section)
-
-
-.. note::
-
- If you don't have any predictions available, you can use the Lightly pretagging
- model. See :ref:`Pretagging ` for more information.
-
-
-Predictions
------------
-Lightly needs to know which objects to process. This information is provided
-by uploading a set of object predictions to the datasource (see :ref:`ref-docker-datasource-predictions`).
-Let's say we are working with a dataset containing different types of vehicles
-and used an object detection model to find possible vehicle objects in the
-dataset. Then the file structure of the datasource should look like this:
-
-.. code-block:: bash
-
- datasource/vehicles_dataset/
- + .lightly/predictions/
- + tasks.json
- + vehicles_object_detections/
- + schema.json
- + image_1.json
- ...
- + image_N.json
- + image_1.png
- + image_2.png
- ...
- + image_N.png
-
-
-The following files should be added to the *.lightly/predictions*
-directory in the datasource:
-
-- A *tasks.json* file that contains the name of the subdirectory in which the
- prediction files are stored.
-
- .. code-block::
- :caption: .lightly/predictions/tasks.json
-
- [
- "vehicles_object_detections"
- ]
-
-- A *schema.json* file that specifies that the predictions are from an
- *object-detection* task and a list of all possible object categories.
-
- .. code-block:: javascript
- :caption: .lightly/predictions/vehicles_object_detections/schema.json
-
- {
- "task_type": "object-detection",
- "categories": [
- {
- "id": 0,
- "name": "car",
- },
- {
- "id": 1,
- "name": "truck",
- },
- {
- "id": 2,
- "name": "motorbike",
- }
- ]
- }
-
-- And for each image, or video frame, in the dataset an *IMAGE_NAME.json* file
- which holds the predictions the object detection model made for the given image:
-
- .. code-block:: javascript
- :caption: .lightly/predictions/vehicles_object_detections/image_1.json
-
- {
- "file_name": "image_1.png",
- "predictions": [
- {
- "category_id": 1,
- "bbox": [...],
- "score": 0.8
- },
- {
- "category_id": 0,
- "bbox": [...],
- "score": 0.9
- },
- {
- "category_id": 2,
- "bbox": [...],
- "score": 0.5
- }
- ]
- }
-
-For more information regarding the predictions format please see :ref:`ref-docker-datasource-predictions`.
-
-
-Selection on Object Level
--------------------------
-Once you have everything set up as described above, you can run selection on
-object level by setting the `object_level.task_name` argument in the :ref:`docker configuration `.
-The argument should be set to the task name you used for your predictions.
-If you uploaded the predictions to e.g. `.lightly/predictions/vehicles_object_detections`
-then you should set `object_level.task_name` to `vehicles_object_detections`.
-
-The object level job can either be scheduled from the Lightly Web App or
-from Python code. Examples of how to schedule the job are provided below.
-
-.. tabs::
-
- .. tab:: Web App
-
- **Trigger the Job**
-
- To trigger a new job you can click on the schedule run button on the dataset
- overview as shown in the screenshot below:
-
- .. figure:: ../integration/images/schedule-compute-run.png
-
- After clicking on the button you will see a wizard to configure the parameters
- for the job.
-
- .. figure:: ../integration/images/schedule-compute-run-config.png
-
- In this example we have to set the `object_level.task_name` parameter
-         in the docker config; all other settings are default values. The
- resulting docker config should look like this:
-
- .. literalinclude:: code_examples/object_level_worker_config.txt
- :caption: Docker Config
- :language: javascript
-
- The Lightly config remains unchanged.
-
- .. tab:: Python Code
-
- .. literalinclude:: code_examples/python_run_object_level.py
-
-
-.. _object-level-pretagging:
-
-Lightly Pretagging
-------------------
-Instead of providing your own predictions, it's also possible to use the built-in pretagging model from Lightly. To do so,
-set `pretagging=True` in your config and set `object_level.task_name="lightly_pretagging"`. For more information
-about the prediction model and classes, go to :ref:`Lightly Pretagging Model `.
-
-.. tabs::
-
- .. tab:: Web App
-
- .. literalinclude:: code_examples/object_level_worker_config_pretagging.txt
- :caption: Docker Config
- :language: javascript
-
- The Lightly config remains unchanged.
-
- .. tab:: Python Code
-
- .. literalinclude:: code_examples/python_run_object_level_pretagging.py
-
-
-Padding
--------
-Lightly makes it possible to add padding around your bounding boxes. This allows
-for better visualization of the cropped images in the Web App and can improve the
-embeddings of the objects as the embedding model sees the objects in context. To add
-padding, simply specify `object_level.padding=X` where `X` is the padding relative
-to the bounding box size. For example, a padding of `X=0.1` will extend both width and
-height of all bounding boxes by 10 percent.
-
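-The following sketch illustrates the effect of a relative padding on a single
-`[x, y, width, height]` box. It only shows the arithmetic; the exact cropping and
-clipping behaviour inside the worker may differ:
-
-.. code-block:: python
-
-    def pad_bbox(bbox, padding, image_width, image_height):
-        """Extends a [x, y, w, h] box by `padding` * size (half on each side)
-        and clips it to the image boundaries."""
-        x, y, w, h = bbox
-        pad_w, pad_h = w * padding / 2, h * padding / 2
-        x0, y0 = max(0, x - pad_w), max(0, y - pad_h)
-        x1 = min(image_width, x + w + pad_w)
-        y1 = min(image_height, y + h + pad_h)
-        return [x0, y0, x1 - x0, y1 - y0]
-
-    # padding=0.1 extends both width and height by 10 percent
-    print(pad_bbox([140, 100, 80, 90], 0.1, 1280, 720))  # [136.0, 95.5, 88.0, 99.0]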
-
-Object Crops Dataset
---------------------
-Once the docker job is started it fetches all images and predictions from the
-remote datasource and processes them. For each prediction, the docker crops
-the object from the full image and creates an embedding for it. Then it selects
-a subset of the objects and uploads **two** datasets to the Lightly Platform:
-
-1. The crops and embeddings of the selected objects are uploaded to an object
- *crops* dataset on the platform. By default, the dataset has the same name as
- the original image dataset but with a "-crops" suffix appended to it.
- Alternatively, you can also choose a custom dataset name by setting
- the `object_level.crop_dataset_name` config option.
-2. If an object is selected, then the full image containing that object is
- also uploaded. You can find these images in the original dataset from which
- you started the selection job.
-
-You can see example images of the two datasets below.
-
-Object Crop Dataset:
-
-.. figure:: images/object_level_vehicle_crops_examples.jpg
-
-
-Original Full Image Dataset:
-
-.. figure:: images/object_level_vehicle_examples.jpg
-
-
-Analyzing the Crop Dataset
---------------------------
-The crop dataset allows you to analyze your data on an object level. In our
-vehicles dataset we could, for example, be interested in the diversity of the
-vehicles. If we go to our crops dataset and select the *Embedding* view in the
-menu, we can see that crops are roughly grouped by vehicle type:
-
-Cars:
-
-.. figure:: images/object_level_vehicle_car_cluster.jpg
-
-Trucks:
-
-.. figure:: images/object_level_vehicle_truck_cluster.jpg
-
-Motorbikes:
-
-.. figure:: images/object_level_vehicle_motorbike_cluster.jpg
-
-
-This can be a very efficient way to get insights into your data without the need
-for human annotations. The embedding view allows you to dig deeper into the
-properties of your dataset and reveal things like:
-
-- Q: What sort of special trucks do we have?
- A: There are a lot of ambulances and school buses.
-- Q: Are there also vans in the dataset?
-  A: There are only a few of them; we should try to get more images containing vans.
-- Q: Are there images of cars in different weather conditions?
-  A: Most images seem to be taken in sunny weather with good lighting conditions.
-
-These hidden biases are hard to find in a dataset if you only rely on full
-images or the coarse vehicle type predicted by the object detection model.
-Lightly helps you to identify them quickly and assists you in monitoring and
-improving the quality of your dataset. After an initial exploration you can now
-take further steps to enhance the dataset using one of the workflows Lightly
-provides:
-
-- Select a subset of your data using our :ref:`Sampling Algorithms `
-- Select new samples to add to your dataset using :ref:`Active Learning `
-- Prepare images for labelling by `exporting them to LabelStudio `_
-
-Multiple Object Level Runs
---------------------------
-You can run multiple object level workflows using the same dataset. To start a
-new run, please select your original full image dataset in the Lightly Web App
-and schedule a new run from there. If you are running the docker from Python or
-over the API, you have to set the `dataset_id` configuration option to the id of
-the original full image dataset. In both cases make sure that the run is *not*
-started from the crops dataset as this is not supported!
-
-You can control to which crops dataset the newly selected object crops are
-uploaded by setting the `object_level.crop_dataset_name` configuration option.
-By default this option is not set and if you did not specify it in the first run,
-you can also omit it in future runs. In this case Lightly will automatically
-find the existing crops dataset and add the new crops to it. If you want to
-upload the crops to a new dataset or have set a custom crop dataset name in a
-previous run, then set the `object_level.crop_dataset_name` option to a new
-or existing dataset name, respectively.
diff --git a/docs/source/docker_archive/advanced/overview.rst b/docs/source/docker_archive/advanced/overview.rst
deleted file mode 100644
index 2cb216d7c..000000000
--- a/docs/source/docker_archive/advanced/overview.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-Advanced
-===================================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Here you will learn about more advanced usage patterns of the Lightly Docker.
-
-
-.. toctree::
- :maxdepth: 2
-
- meta_information.rst
- datapool.rst
- pretagging.rst
- datasource_predictions.rst
- datasource_metadata.rst
- active_learning.rst
- sequence_selection.rst
- object_level.rst
diff --git a/docs/source/docker_archive/advanced/pretagging.rst b/docs/source/docker_archive/advanced/pretagging.rst
deleted file mode 100644
index c57c4bb09..000000000
--- a/docs/source/docker_archive/advanced/pretagging.rst
+++ /dev/null
@@ -1,146 +0,0 @@
-.. _ref-docker-pretagging:
-
-Pretagging
-======================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Lightly Docker supports the use of pre-trained models to tag the dataset. We
-call this pretagging. For now, we offer a pre-trained model for object detection
-optimized for autonomous driving.
-
-Using a pretrained model does not remove the need for high-quality human annotations.
-However, we can use the model predictions to get some idea of the underlying
-distribution within the dataset.
-
-The model is capable of detecting the following core classes:
-
- - bicycle
- - bus
- - car
- - motorcycle
- - person
- - train
- - truck
-
-
-How It Works
----------------
-
-Our pretagging model is based on a FasterRCNN model with a ResNet-50 backbone.
-The model has been trained on a dataset consisting of ~100k images.
-
-The results of pretagging are visualized in the report. We report the
-object distribution both before and after the selection process.
-
-The following image shows an example of such a histogram for the input data
-before filtering.
-
-.. figure:: ../resources/pretagging_histogram_example.png
- :align: center
- :alt: some alt text
-
- Histogram plot of the pretagging model for the input data (full dataset).
- The plot shows the distribution of the various detected classes.
-    Furthermore, it shows the average number of objects per image.
-
-For every docker run with pretagging enabled we also dump all model predictions
-into a json file with the following format:
-
-.. code-block:: javascript
-
- // boxes have format x1, y1, x2, y2
- [
- {
- "filename": "0000000095.png",
- "boxes": [
- [
- 0.869,
- 0.153,
- 0.885,
- 0.197
- ],
- [
- 0.231,
- 0.175,
- 0.291,
- 0.202
- ]
- ],
- "labels": [
- "person",
- "car"
- ],
- "scores": [
- 0.9845203757286072,
- 0.9323102831840515
- ]
- },
- ...
- ]
-
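-A small sketch for working with this dump. The file location and the image size are
-placeholders, and the [0, 1] coordinate range is inferred from the example values above:
-
-.. code-block:: python
-
-    import json
-
-    # path to the dumped predictions; depends on your run's output directory
-    with open("output_dir/pretagging_predictions.json") as f:
-        entries = json.load(f)
-
-    image_width, image_height = 1280, 720  # size of your images
-
-    for entry in entries:
-        for box, label, score in zip(entry["boxes"], entry["labels"], entry["scores"]):
-            if score < 0.5:
-                continue  # keep only confident detections
-            x1, y1, x2, y2 = box
-            box_px = [x1 * image_width, y1 * image_height,
-                      x2 * image_width, y2 * image_height]
-            print(entry["filename"], label, round(score, 3), box_px)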
-
-Usage
----------------
-
-Pretagging can be activated by passing the following argument to your docker
-run command: `pretagging=True`
-
-- `pretagging=True` enables the use of the pretagging model
-- `pretagging_debug=True` adds a few images showing the bounding box predictions to the report for debugging.
-- `pretagging_upload=True` enables uploading of the predictions to a configured datasource.
-
-
-The final docker run command to enable pretagging as well as pretagging_debug
-should look like this:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- pretagging=True \
- pretagging_debug=True
-
-The following shows an example of what the debugging images in the report look like:
-
-.. figure:: ../resources/pretagging_debug_example.png
- :align: center
- :alt: some alt text
-
-    The plot shows the detected bounding boxes from the pretagging overlaid
- on the image. Use the debug feature to figure out whether the pretagging
- mechanism works properly on your dataset.
-
-
-Pretagging for Selection
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-You can also use pretagging to guide the data selection process. This can be
-helpful if, for example, you only care about images where there is at least one
-person and more than one car.
-
-To set up such a pretagging selection mechanism, you need to create a config file.
-
-For the example of selecting only images with >=1 person and >=2 cars we can
-create a `min_requirements.json` file like this:
-
-.. code-block:: json
-
- {
- "person": 1,
- "car": 2
- }
-
-Move this file to the shared directory (to make it accessible to the docker
-container).
-Finally, run the docker with `pretagging=True`
-and `pretagging_config=min_requirements.json`.
-Only images satisfying all declared requirements will be selected.
diff --git a/docs/source/docker_archive/advanced/sequence_selection.rst b/docs/source/docker_archive/advanced/sequence_selection.rst
deleted file mode 100644
index 43fc8d5c8..000000000
--- a/docs/source/docker_archive/advanced/sequence_selection.rst
+++ /dev/null
@@ -1,84 +0,0 @@
-Sequence Selection
-==================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Sequence selection allows users to select sequences of a video instead of single frames.
-The key concept is the parameter `selected_sequence_length`. If its value is one (default),
-the docker selects single frames. If it is larger than one, each video is split into
-sequences of that length and the frame representations are aggregated into a sequence
-representation. The selection then happens on these sequence representations.
-
-.. note:: Sequence selection works on videos or on folders of alphabetically sorted
- frames.
-
-
-How It Works
--------------
-Sequence selection consists of the following steps:
-
-1. Each input video is split into sequences of length `selected_sequence_length`.
-2. Next, the embeddings of all frames in a sequence are aggregated (averaged).
-3. The selection is performed on sequence level.
-4. Finally, the indices of the selected sequence frames are reconstructed.
-5. The report is generated and (if requested) the selected frames are saved.
-
-
-Usage
------------
-
-To select sequences of length **X** simply add the argument **selected_sequence_length=X**
-to your docker run command. Here, **X** must be an integer which evenly divides
-**stopping_condition.n_samples**. If **stopping_condition.n_samples** is a fraction,
-the Lightly docker will automatically round it to the next multiple of **X**.
-
-
-For example, let's say we have a folder with two videos
-which we randomly downloaded from `Pexels `_:
-
-.. code-block:: console
-
- ls /datasets/pexels/
- > Pexels Videos 1409899.mp4 Pexels Videos 2495382.mp4
-
-Now, we want to select sequences of length ten. We use:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v /datasets/pexels:/home/input_dir:ro \
- -v /outputs/:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- stopping_condition.n_samples=200 \
- enable_corruptness_check=False \
- remove_exact_duplicates=False \
- dump_dataset=True \
- selected_sequence_length=10
-
-The above command will select 20 sequences each consisting of ten frames. The selected
-frames are then saved in the output directory for further processing. Note that Lightly
-docker currently doesn't support the corruptness check and removing exact duplicates for
-sequence selection. Hence we have to deactivate them in the command above.
-
-
-.. warning:: The stopping condition `n_samples` must be equal to the number of
- desired sequences times the `selected_sequence_length`, i.e. **n_samples = n_sequences x selected_sequence_length**.
- In the example above 20 sequences times ten frames is exactly 200.
-
-
-In our example, a look at a PCA of the embeddings of the selected frames nicely shows
-the 20 selected sequences. The following image is taken from the output of the Lightly
-docker:
-
-.. figure:: images/sequence_selection_pca.png
- :align: center
- :alt: PCA of embeddings of frames
- :figwidth: 80%
-
- PCA of the embeddings of the frames in the selected sequences from the two
- input videos (yellow and purple).
\ No newline at end of file
diff --git a/docs/source/docker_archive/code_examples/webapp_default_lightly_config.txt b/docs/source/docker_archive/code_examples/webapp_default_lightly_config.txt
deleted file mode 100644
index daefb7809..000000000
--- a/docs/source/docker_archive/code_examples/webapp_default_lightly_config.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-{
- loader: {
- batch_size: 16,
- shuffle: true,
- num_workers: -1,
- drop_last: true
- },
- model: {
- name: 'resnet-18',
- out_dim: 128,
- num_ftrs: 32,
- width: 1
- },
- trainer: {
- gpus: 1,
- max_epochs: 100,
- precision: 32
- },
- criterion: {
- temperature: 0.5
- },
- optimizer: {
- lr: 1,
- weight_decay: 0.00001
- },
- collate: {
- input_size: 64,
- cj_prob: 0.8,
- cj_bright: 0.7,
- cj_contrast: 0.7,
- cj_sat: 0.7,
- cj_hue: 0.2,
- min_scale: 0.15,
- random_gray_scale: 0.2,
- gaussian_blur: 0.5,
- kernel_size: 0.1,
- vf_prob: 0,
- hf_prob: 0.5,
- rr_prob: 0
- }
-}
\ No newline at end of file
diff --git a/docs/source/docker_archive/code_examples/webapp_default_worker_config.txt b/docs/source/docker_archive/code_examples/webapp_default_worker_config.txt
deleted file mode 100644
index 28c29ad62..000000000
--- a/docs/source/docker_archive/code_examples/webapp_default_worker_config.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-{
- enable_corruptness_check: true,
- remove_exact_duplicates: true,
- enable_training: false,
- pretagging: false,
- pretagging_debug: false,
- method: 'coreset',
- stopping_condition: {
- n_samples: 0.1,
- min_distance: -1
- },
- scorer: 'object-frequency',
- scorer_config: {
- frequency_penalty: 0.25,
- min_score: 0.9
- },
- active_learning: {
- task_name: '',
- score_name: 'uncertainty_margin'
- }
-}
\ No newline at end of file
diff --git a/docs/source/docker_archive/configuration/configuration.rst b/docs/source/docker_archive/configuration/configuration.rst
deleted file mode 100644
index 59c297a71..000000000
--- a/docs/source/docker_archive/configuration/configuration.rst
+++ /dev/null
@@ -1,260 +0,0 @@
-.. _ref-docker-configuration:
-
-Configuration
-===================================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Like the Lightly\ **SSL** framework, the docker solution can be configured using Hydra.
-
-The example below shows how the `token` parameter can be set when running the docker container.
-
-.. code-block:: console
-
- docker run --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/shared_dir \
- --ipc="host" --network="host" \
- lightly/worker:latest \
- token=MYAWESOMETOKEN
-
-Note that `token` can optionally be passed as a `LIGHTLY_TOKEN` environment variable to keep it hidden from logs:
-
-.. code-block:: console
-
- docker run --rm -it \
- -e LIGHTLY_TOKEN=MYAWESOMETOKEN \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/shared_dir \
- --ipc="host" --network="host" \
- lightly/worker:latest
-
-.. _rst-docker-parameters:
-
-List of Parameters
------------------------------------
-
-The following are parameters which can be passed to the container:
-
-.. code-block:: yaml
-
- # Access token, get it from app.lightly.ai.
- token: ''
-
- worker:
- # If specified, the docker is started as a worker on the Lightly platform.
- worker_id: ''
- # If True, the worker notifies that it is online even though another worker
- # with the same worker_id is already online.
- # This can be useful if that worker is actually offline but was not able to properly shut down because it crashed.
- # If False and a worker with the same worker_id already exists, the docker aborts.
- force_start: True
-
- # Set to True to check whether installation was successful.
- sanity_check: False
-
- # Path to a file containing filenames to run the docker on a subset of the
- # files in the input directory. The docker will ignore all files in the input
- # directory not listed here. Each filename must be on a separate line and
- # relative to the input directory.
- # If you use a cloud bucket as datasource, the path is relative
- # to the root of your input datasource. If you specified a 2nd output datasource,
- # and the path contains `.lightly`, then the path is relative to the output datasource.
- # If you have a local input directory, the path is relative to the shared directory,
- # so if the file is in 'shared_dir/directory/relevant_filenames.txt'
- # the path should be set to 'directory/relevant_filenames.txt'
- relevant_filenames_file: ''
-
- # Set to False to disable check for corrupted images.
- enable_corruptness_check: True
- corruptness_check:
- # Threshold in [0, 1] which determines the sensitivity of the corruptness check
- # for video frames. Every frame which has an internally computed corruptness
- # score larger than the specified threshold will be classified as corrupted.
- corruption_threshold: 0.1
-
- # Remove exact duplicates.
- remove_exact_duplicates: True
-
- # Path to the checkpoint relative to the shared directory.
- checkpoint: ''
-
- # Path to the embeddings file relative to the shared directory.
- embeddings: ''
-
- # Enable training, only possible when no embeddings are passed.
- enable_training: False
-
- # Dump the final dataset to the output directory.
- dump_dataset: False
- dump_sampled_embeddings: True
- # Set the size of the dumped images, use =x or =[height,width] to match the
- # shortest edge to x or to resize the image to (height, width), use =-1 for no
- # resizing (default). This only affects the output size of the images dumped to
- # the output folder with dump_dataset=True. To change the size of images
- # uploaded to the lightly platform or your cloud bucket please use the
- # lightly.resize option instead.
- output_image_size: -1
- output_image_format: 'png'
-
- # Upload the dataset to the Lightly platform.
- upload_dataset: False
-
- # pretagging
- pretagging: False
- pretagging_debug: False
- pretagging_config: ''
- pretagging_upload: False
-
- # Append weak labels.
- append_weak_labels: False
-
- # Normalize the embeddings to unit length.
- normalize_embeddings: True
-
- # active learning scorer
- scorer: 'object-frequency'
- scorer_config:
- frequency_penalty: 0.25
- min_score: 0.9
-
-
- # Selection
- # Choose from ['coreset', 'random'].
- method: 'coreset'
- # Choose when to stop the selection
- stopping_condition:
- # The maximum number of samples selected
- # Float in [0., 1.] for percentage, int for number of samples, -1 means inactive.
- n_samples: -1
- # Float, minimum distance between two selected images. -1 means inactive.
- min_distance: -1.
- selected_sequence_length: 1
-
- # datapool
- datapool:
- # Name of the datapool. This will create a local datapool.
- name:
- # If True keeps backup of all previous data pool states.
- keep_history: True
- # Dataset id from Lightly platform where the datapool should be hosted.
- dataset_id:
-
- # datasource
- # By default only new samples in the datasource are processed. Set process_all
- # to True to reprocess all samples in the datasource.
- datasource:
- # Dataset id from the Lightly platform.
- dataset_id:
- # Set to True to reprocess all samples in the datasource.
- process_all: False
- # Update datapool with the selected samples.
- enable_datapool_update: True
- # Use video metadata to determine the number of frames in each video. Set to
- # True for faster processing. Set to False if you get video related errors.
- use_frame_count_metadata: False
-
- # active learning
- active_learning:
- task_name: ''
- score_name: 'uncertainty_margin'
-
- # object level
- object_level:
- # Name of the object prediction task which contains all the object annotations.
- task_name: ''
- # Name of the additional crop dataset on the Lightly Platform. A new dataset
- # is created if no dataset with this name exists.
- crop_dataset_name: ''
- # Padding relative to the bbox size
- padding: 0.0
-
- # Upload report to the Lightly platform.
- upload_report: True
- # The number of retained/removed image pairs shown in the report.
- n_example_images: 6
- # Maximum size of the distance matrix allowed for report statistics in GB.
- memory_requirement_in_GB: 2
- # Show timestamps of the selected frames for each video in the report. Set this
- # to False if you observe slow report generation or work with many videos (>20).
- show_video_sampling_timeline: True
-
- # optional deterministic unique output subdirectory for run, in place of timestamp
- run_directory:
-
-Additionally, you can pass all arguments which can be passed to the lightly CLI tool with the `lightly` prefix.
-For example,
-
-.. code-block:: console
-
- docker run --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- lightly.loader.batch_size=512
-
-sets the batch size during training and embedding to 512. You can find a list of all
-lightly CLI parameters here: :ref:`ref-cli-config-default`
-
-Choosing the Right Parameters
------------------------------------
-
-Below you find some distributions and the resulting histograms of the pairwise
-distances. Typically, datasets consist of multiple normal or uniform
-distributions (second row). This makes sense: in autonomous driving, we collect
-data in various cities, under different weather conditions, and under other varying
-factors. When working with video data from multiple cameras, each camera might form
-a cluster since images from the same static camera have lots of perceptual similarity.
-
-The more interesting question is what kind of distribution you're aiming for.
-
-
-**If we want to diversify the dataset** (e.g. create a really hard test set
-covering all the special cases) we might want to aim for what looks like a grid.
-The log histogram (yes, we plot the histograms in log scale!) for a grid pattern with
-equal distance between two neighboring samples looks like a D.
-
-
-**If you want to remove nearby duplicates** (e.g. reduce overfitting and bias)
-we see good results when sampling with the *min_distance* stopping condition.
-E.g. set *min_distance* to 0.1 to get rid of the small peak (if there is any)
-close to 0 pairwise distance, as in the example command below.
-
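-A minimal sketch of such a run, combined with your usual volume mounts and token:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- stopping_condition.min_distance=0.1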
-
-.. image:: images/histograms_overview.png
-
-
-
-Increase I/O Performance
------------------------------------
-During the embedding process, the I/O bandwidth can often slow down the computation. A progress bar shows you the current compute
-efficiency which is calculated as the time spent on computation compared to overall time per batch. A number close to 1.0 tells you
-that your system is well utilized. A number close to 0.0 however, suggests that there is an I/O bottleneck. This can be the case for
-datasets consisting of very high-resolution images. Loading them from harddisk and preprocessing can take a lot of time.
-
-To mitigate the effect of low I/O speed one can use background workers to load the data. First, we need to tell Docker to use
-the host system for inter-process communication. Then, we can tell the filter to use multiple workers for data preprocessing.
-You can use them by adding the following two parts to your docker run command:
-
-* **-\-ipc="host"** sets the host for inter-process communication.
- This flag needs to be set to use background workers. Since this is an argument
- to the docker run command we add it before our filter arguments.
-
-* **lightly.loader.num_workers=8** sets the number of background processes
- to be used for data preprocessing. Usually, the number of physical
- CPU cores works well.
-
-.. code-block:: console
-
- docker run --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/output_dir \
- --ipc=host \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- lightly.loader.num_workers=8
diff --git a/docs/source/docker_archive/configuration/images/histograms_overview.png b/docs/source/docker_archive/configuration/images/histograms_overview.png
deleted file mode 100644
index 4e252a1ff..000000000
Binary files a/docs/source/docker_archive/configuration/images/histograms_overview.png and /dev/null differ
diff --git a/docs/source/docker_archive/examples/academic_datasets.rst b/docs/source/docker_archive/examples/academic_datasets.rst
deleted file mode 100644
index 01ae3e08e..000000000
--- a/docs/source/docker_archive/examples/academic_datasets.rst
+++ /dev/null
@@ -1,121 +0,0 @@
-ImageNet
-===================================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Let's have a look at how to run the docker container to analyze and filter the famous
-ImageNet dataset. You can reproduce the sample report using the following
-command.
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v /datasets/imagenet/train/:/home/input_dir:ro \
- -v /datasets/docker_imagenet_500k:/home/output_dir \
- --ipc="host" \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- lightly.collate.input_size=64 \
- lightly.loader.batch_size=256 \
- lightly.loader.num_workers=8 \
- lightly.trainer.max_epochs=0 \
- stopping_condition.n_samples=500000 \
- remove_exact_duplicates=True \
- enable_corruptness_check=False
-
-The complete **processing time** was **04h 37m 02s**. The machine used for this experiment is a cloud instance with
-8 cores, 30GB of RAM, and a V100 GPU. The dataset was stored on an SSD drive.
-
-You can also use the direct link for the
-`ImageNet `_ report.
-
-
-
-
-
-Combining Cityscapes with Kitti
-================================
-
-Using Lightly Docker and the datapool feature we can combine two datasets and
-ensure that we only keep the unique samples.
-
-.. code-block:: console
-
- docker run --shm-size="512m" --gpus all --rm -it \
- -v /datasets/cityscapes/leftImg8bit/train/:/home/input_dir:ro \
- -v /datasets/docker_out_cityscapes:/home/output_dir \
- -v /datasets/docker_out_cityscapes:/home/shared_dir \
- --ipc="host" --network="host" lightly/worker:latest \
- token=MYAWESOMETOKEN lightly.loader.num_workers=8 \
- stopping_condition.min_distance=0.2 remove_exact_duplicates=True \
- enable_corruptness_check=False enable_training=True \
- lightly.trainer.max_epochs=20 lightly.optimizer.lr=1.0 \
- lightly.trainer.precision=32 lightly.loader.batch_size=256 \
- lightly.collate.input_size=64 datapool.name=autonomous_driving
-
-The report for running the command can be found here:
-:download:`Cityscapes.pdf <../resources/datapool_example_cityscapes.pdf>`
-
-Since the Cityscapes dataset has subfolders for the different cities, Lightly
-Docker uses them as weak labels for the embedding plot as shown below.
-
-.. figure:: ../resources/cityscapes_scatter_umap_k_15_no_overlay.png
- :align: center
- :alt: some alt text
-
- Scatterplot of Cityscapes. Each color represents one of the 18
- subfolders (cities) of the Cityscapes dataset.
-
-
-Now we can use the datapool and pre-trained model to select the interesting
-frames from Kitti and add them to Cityscapes:
-
-.. code-block:: console
-
- docker run --shm-size="512m" --gpus all --rm -it \
- -v /datasets/kitti/training/image_2/:/home/input_dir:ro \
- -v /datasets/docker_out_cityscapes:/home/output_dir \
- -v /datasets/docker_out_cityscapes:/home/shared_dir \
- --ipc="host" --network="host" lightly/worker:latest \
- token=MYAWESOMETOKEN lightly.loader.num_workers=8 \
- stopping_condition.min_distance=0.2 remove_exact_duplicates=True \
- enable_corruptness_check=False enable_training=False \
- lightly.trainer.max_epochs=20 lightly.optimizer.lr=1.0 \
- lightly.trainer.precision=32 lightly.loader.batch_size=256 \
- lightly.collate.input_size=64 datapool.name=autonomous_driving
-
-
-We will end up with new plots in the report due to the datapool. The plots show
-the embeddings and highlight with blue color the samples which have been added
-from the new dataset. In our experiment, we see that Lightly Docker added several
-new samples outside of the previous embedding distribution. This is great, since it
-shows that Cityscapes and Kitti have different data and we can combine the two datasets.
-
-.. figure:: ../resources/datapool_umap_scatter_before_threshold_0.2.png
- :align: center
- :alt: An example of the newly selected examples when we use
- stopping_condition.min_distance=0.2
-
- An example of the newly selected examples when we use
- stopping_condition.min_distance=0.2. 7089 samples from Kitti have been added
- to our existing datapool.
-
-.. figure:: ../resources/datapool_umap_scatter_before_threshold_0.05.png
- :align: center
- :alt: An example of the newly selected examples when we use
- stopping_condition.min_distance=0.05
-
- An example of the newly selected examples when we use
- stopping_condition.min_distance=0.05. 3598 samples from Kitti have been added
- to our existing datapool.
-
-
-The report for running the command can be found here:
-:download:`kitti_with_min_distance=0.2.pdf <../resources/datapool_example_kitti_threshold_0.2.pdf>`
-
-And the report for a stopping condition minimum distance of 0.05:
-:download:`kitti_with_min_distance=0.05.pdf <../resources/datapool_example_kitti_threshold_0.05.pdf>`
\ No newline at end of file
diff --git a/docs/source/docker_archive/examples/datasets_in_the_wild.rst b/docs/source/docker_archive/examples/datasets_in_the_wild.rst
deleted file mode 100644
index 1f784b5ce..000000000
--- a/docs/source/docker_archive/examples/datasets_in_the_wild.rst
+++ /dev/null
@@ -1,188 +0,0 @@
-Extract Diverse Video Frames
-=============================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-The following example showcases how the lightly docker solution can be used
-to extract frames from a video based on their uniqueness
-rather than based on fixed timestamps.
-
-
-Using ffmpeg
------------------------------
-
-Using tools such as ffmpeg we can extract frames from a video
-using a simple one-liner like this:
-
-.. code-block:: console
-
- # extract all frames from video.mp4 as .png files and store in frames/ folder
- ffmpeg -i video.mp4 frames/%d.png
-
-ffmpeg allows us to use various flags to choose framerate, crop the images,
-resize the images or set the quality as shown here:
-
-.. code-block:: console
-
- # set framerate to 5 fps
- ffmpeg -i video.mp4 -filter:v "fps=5" frames/%d.png
-
- # resize image to 256x256 pixels
- ffmpeg -i video.mp4 -s 256x256 frames/%d.png
-
- # extract frames as .jpg files
- # high quality jpg compression
- ffmpeg -i video.mp4 -qscale:v 1 frames/%d.jpg
-
- # lower quality jpg compression
- ffmpeg -i video.mp4 -qscale:v 5 frames/%d.jpg
-
- # crop a 480x480 image with 80 pixels offset in x direction
- ffmpeg -i video.mp4 -filter:v "crop=480:480:80:0" frames/%d.png
-
- # and many more ...
-
-However, the problem is that the extracted frames add up and use lots of storage.
-For most training tasks, we don't even want to extract all the frames. Limiting
-the framerate is very easy and helps us reduce the amount of extracted data.
-On the other hand, even a video with 5 fps might contain lots of similar frames
-or, even worse, we might miss some frames with lots of "action".
-
-Using Lightly Docker
------------------------------
-
-Lightly Docker has been designed to give engineers an alternative to using
-fixed framerates for frame extraction.
-
-How about selecting frames based on their similarity?
-
-In this example, we use the following video: https://www.pexels.com/de-de/video/3719157/
-
-We download the video to a local folder */dataset/video/*. We can use wget in
-a terminal under linux or MacOS to download the video (just make sure you
-navigated to the directory where you want to download the video to).
-
-Let us extract frames from the video using ffmpeg. We want to get 5 frames per
-second (fps). Create a new directory called */dataset/frames_ffmpeg/*. Using ffmpeg we can
-extract the frames with the following command:
-
-.. code-block:: console
-
- ffmpeg -i raw/video.mp4 -filter:v "fps=5" frames_ffmpeg/%d.png
-
-
-Extracting the frames without introducing compression artifacts uses lots of
-storage. In this example, we have a small video of 6.4 MBytes. Once extracted,
-the .png frames together with the video consume 453.4 MBytes. This is a
-70x increase!
-
-.. list-table::
- :widths: 50 50 50 30
- :header-rows: 1
-
- * - Metric
- - ffmpeg extracted frames
- - Lightly using video
- - Reduction
- * - Storage Consumption
- - 447 MBytes + 6.4 MBytes
- - 6.4 MBytes
- - 70.84x
-
-.. note:: Why not extract the frames as compressed .jpg images? Extracting the
- frames as .jpg would indeed reduce storage consumption. The video from
- our example would end up using (14 MBytes + 6.4 MBytes). However, for
- critical applications where robustness and accuracy of the model are
- key, we have to think about the final system in production. Is your
- production system working with the raw camera stream (uncompressed) or
- with compressed frames (e.g. .jpg)? Very often we don’t have time to
- compress a frame in real-time systems or don’t want to introduce
- compression artifacts. You should also think about whether you want
- to train a model on compressed data whereas in production it runs
- on raw data.
-
-Now we want to do the same using Lightly Docker. Since the ffmpeg command
-extracted 99 frames let's extract 99 frames as well:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it -v /dataset/video/:/home/input_dir:ro \
- -v /datasets/videos/docker_out:/home/output_dir \
- -v /datasets/docker_shared_dir:/home/shared_dir --ipc="host" \
- --network="host" lightly/worker:latest token=MYAWESOMETOKEN \
- lightly.collate.input_size=64 lightly.loader.batch_size=32 \
- lightly.loader.num_workers=8 lightly.trainer.max_epochs=10 \
- stopping_condition.n_samples=100 remove_exact_duplicates=True \
- enable_corruptness_check=False enable_training=False dump_dataset=True \
- method=coreset
-
-To perform a random selection, we can simply replace coreset with random as
-the selection method. Note that coreset is the default method.
-
-Let's have a look at some statistics of the two obtained datasets:
-
-.. list-table::
- :widths: 50 50 50 50 50
- :header-rows: 1
-
- * - Metric
- - original dataset
- - after ffmpeg
- - after random
- - after coreset
- * - Number of Samples
- - 475
- - 99
- - 99
- - 99
- * - L2 Distance (Mean)
- - 1.2620
- - 1.2793
- - 1.2746
- - 1.3711
- * - L2 Distance (Min)
- - 0.0000
- - 0.0000
- - 0.0586
- - 0.2353
- * - L2 Distance (Max)
- - 1.9835
- - 1.9693
- - 1.9704
- - 1.9470
- * - L2 Distance (10th Percentile)
- - 0.5851
- - 0.5891
- - 0.5994
- - 0.8690
- * - L2 Distance (90th Percentile)
- - 1.8490
- - 1.8526
- - 1.8525
- - 1.7822
-
-We notice the following when looking at this table:
-
-- The **min distance** between two samples was 0 after ffmpeg selection whereas the
- min distance increased significantly when using the coreset selection strategy.
-
- - 0 distance means that there are at least two samples completely identical
- (e.g. two frames in the video are the same)
-
-- The **mean distance** between the original dataset, ffmpeg, as well as
- random selection, is very similar. The coreset selection however differs
- significantly with a higher mean (higher diversity) in the selected dataset.
-
-- The **10th percentile** shows similar behavior to the mean distance.
-
-As you can see in this example, just selecting every N-th frame is similar to
-selecting frames randomly. More sophisticated selection strategies, such as the coreset selection strategy, result in
-much higher sample diversity. The docker has been optimized for these selection strategies.
-
-.. note:: Note that by default the embeddings of the dataset will be normalized
- to unit vector length. Max L2 distance between two vectors is
- therefore 2.0 (two vectors pointing in opposite directions).
\ No newline at end of file
diff --git a/docs/source/docker_archive/examples/overview.rst b/docs/source/docker_archive/examples/overview.rst
deleted file mode 100644
index 61997e854..000000000
--- a/docs/source/docker_archive/examples/overview.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-Examples
-=================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-We provide examples of how Lightly Docker can be used on various academic and
-wild datasets.
-
-.. toctree::
- :maxdepth: 1
-
- datasets_in_the_wild.rst
- academic_datasets.rst
\ No newline at end of file
diff --git a/docs/source/docker_archive/getting_started/first_steps.rst b/docs/source/docker_archive/getting_started/first_steps.rst
deleted file mode 100644
index aedc5356b..000000000
--- a/docs/source/docker_archive/getting_started/first_steps.rst
+++ /dev/null
@@ -1,635 +0,0 @@
-.. _rst-docker-first-steps:
-
-First Steps
-===================================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-The Lightly Docker solution follows a train, embed, select flow using
-self-supervised learning.
-
-.. code-block:: console
-
- +-------+      +-------+      +--------+
- | Train +----->+ Embed +----->+ Select |
- +-------+      +-------+      +--------+
-
-#. You can either use a pre-trained model from the model zoo or fine-tune
- a model on your unlabeled dataset using self-supervised learning. The output
- of the train step is a model checkpoint.
-
-#. The embed step creates embeddings of the input dataset. Each sample gets
- represented using a low-dimensional vector. The output of the embed step is
- a .csv file.
-
-#. Finally, based on the embeddings and additional information we can use
- one of the selection strategies to pick the relevant data for you.
- The output of the select step is a list of filenames as well as
- analytics in the form of a pdf report with plots.
-
-You can use each of the three steps independently as well. E.g. you can use
-the Lightly Docker to embed a dataset and then train a linear classifier on top
-of the embeddings.
-
-
-The docker solution can be used as a command-line interface. You run the container, tell it where to find data, and where to store the result. That's it.
-There are various parameters you can pass to the container. We put a lot of effort into also exposing the full Lightly\ **SSL** framework configuration.
-You could use the docker solution to train a self-supervised model instead of using the Python framework.
-
-Before jumping into the detail let's have a look at some basics.
-The docker container can be used as a simple script. You can control parameters by changing flags.
-
-Use the following command to get an overview of the available parameters:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it lightly/worker:latest --help
-
-.. note:: In case the command fails because docker does not detect your GPU
- you want to make sure `nvidia-docker` is installed.
- You can follow the guide
- `here `_.
-
-
-Storage Access
------------------------------------
-
-We use volume mapping provided by the docker run command to process datasets.
-A docker container itself is not considered to be a good place to store your data.
-Volume mapping allows the container to work with the filesystem of the host system.
-
-
-There are **three** types of volume mappings:
-
-* **Input Directory:**
- The input directory contains the dataset we want to process. The format of the input data should be either a single
- folder containing all the images or a folder containing a subfolder which holds the images.
- See the tutorial :ref:`input-structure-label` for more information.
- The container has only **read access** to this directory (note the *:ro* at
- the end of the volume mapping).
- Instead of using a local input directory you can also use a cloud storage
- bucket on S3, GCS, or Azure as a remote datasource. For reference, head to
- :ref:`ref-docker-with-datasource`.
-* **Shared Directory:**
- The shared directory allows the user to pass additional inputs such as embeddings or model checkpoints
- to the container. The checkpoints should be generated by the lightly Python package or by the docker
- container and the embeddings should be in the format specified in the tutorial "Structure Your Input".
- The container requires only **read access** to this directory.
-* **Output Directory:**
- The output directory is the place where the results from all computations made by the container are stored.
- See `Reporting`_ and `Docker Output`_ for additional information. The container requires **read and
- write access** to this directory.
-
-.. note:: Docker volume or port mappings always follow the scheme that you first
- specify the host system's path or port followed by the internal one of the
- container. E.g. **-v /datasets:/home/datasets** would mount /datasets
- from your system to /home/datasets in the docker container.
-
-
-Typically, your docker command would start like this:
-
-- Map *{INPUT_DIR}* (from your system) to */home/input_dir* in the container
-
- *e.g. /path/to/my/cat/dataset:/home/input_dir:ro*
-
-- Map *{OUTPUT_DIR}* (from your system) to */home/output_dir* in the container
-
- *e.g. /path/where/I/want/the/docker/output:/home/output_dir*
-
-- Specify the token to authenticate your user
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN
-
-Now, let's see how this will look in action!
-
-.. note:: Learn how to obtain your :ref:`ref-authentication-token`.
-
-.. warning:: Don't forget to replace **{INPUT_DIR}** and **{OUTPUT_DIR}** with the path
- to your local input and output directory. You must not change the
- path after the **:** since this path is describing the internal
- file system within the container!
-
-When running the above docker command you will find a new folder with the current date
-and time in the {OUTPUT_DIR} folder. This can be inconvenient if you want to run the docker
-in an automated pipeline as the current date and time change.
-
-Using the **run_directory** parameter you can use a custom and deterministic output folder.
-The following docker run command would for example store the output in the
-*{OUTPUT_DIR}/docker_out* folder.
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- run_directory="docker_out"
-
-Specify Relevant Files
-----------------------------
-Oftentimes not all files in a directory are relevant. In that case, it's possible
-to pass a list of filenames to the Lightly docker
-using the `relevant_filenames_file` configuration option.
-It will then only consider the listed filenames
-and ignore all others. To do so, you can create a text file which
-contains one relevant filename per line and then pass the path to the text file
-to the docker run command. This works for videos and images.
-
-For example, if this is your input directory:
-
-.. code-block:: console
-
- /path/to/my/data/
- L my-video.mp4
- L my-other-video.mp4
- L some/subfolder/
- L my-third-video.mp4
-
-Then you can specify two input files by creating the following **filenames.txt**:
-
-.. code-block:: console
-
- my-video.mp4
- some/subfolder/my-third-video.mp4
-
-If you use a cloud bucket as input datasource, upload the file to it
-and copy the path of the file relative to the datasource root.
-If you use a cloud bucket and specified a separate input and output bucket,
-put the file in the .lightly folder of the output bucket
-and copy the path of the file relative to the output datasource root.
-
-E.g. if your dataset is at `path/to/dataset` and your relevant_filenames.txt at
-`path/to/dataset/subdir/relevant_filenames.txt`,
-then copy the path `subdir/relevant_filenames.txt`.
-
-If you use a local input directory, place the file in the shared directory
-and copy the path relative to it.
-
-Then you can add `relevant_filenames_file='subdir/relevant_filenames.txt'`
-to the docker run command and the Lightly docker will only consider **my-video.mp4** and **my-third-video.mp4**.
-
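-For a local input directory, the full command could look like the following
-sketch (assuming **filenames.txt** from above was placed at the root of the
-shared directory):
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- relevant_filenames_file='filenames.txt'
-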
-Embedding a Dataset and Selecting from it
------------------------------------------
-
-To embed your images with a pre-trained model, you can run the docker solution with this command:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- remove_exact_duplicates=True \
- enable_corruptness_check=True \
- stopping_condition.n_samples=0.3
-
-The command above does the following:
-
-- **remove_exact_duplicates=True** Removes exact duplicates
-
-- **enable_corruptness_check=True** Checks your dataset for corrupt images
-
-- **stopping_condition.n_samples=0.3** Selects 30% of the images using the
- default method (coreset). Selecting 30% means that the remaining dataset
- will be 30% of the initial dataset size. You can also specify the exact
- number of remaining images by setting **n_samples** to an integer value.
-
-- **stopping_condition.min_distance=0.2** would remove all samples which are
- closer to each other than 0.2.
-
- This allows you to specify the minimum allowed distance between two image
- embeddings in the output dataset. After normalizing the input embeddings
- to unit length, this value should be between 0 and 2. This is often a more
- convenient method when working with different data sources and trying to
- combine them in a balanced way.
-
-By default, the docker only creates an output file with the selected
-filenames for you. You can also tell the program to copy the selected files into
-the output folder by adding the parameter **dump_dataset=True** to the command.
-
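-For example, a sketch combining the selection command from above with dataset dumping:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- remove_exact_duplicates=True \
- enable_corruptness_check=True \
- stopping_condition.n_samples=0.3 \
- dump_dataset=True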
-
-Train a Self-Supervised Model
------------------------------------
-
-Sometimes it may be beneficial to finetune a self-supervised model on your
-dataset before embedding the images. This may be the case when the dataset is
-from a specific domain (e.g. for medical images).
-
-The command below will **train a self-supervised model** for (default: 100)
-epochs on the images stored in the input directory before embedding the images
-and selecting from them.
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- enable_training=True
-
-The training of the model is identical to using the lightly open-source package
-with the following command:
-
-.. code-block:: console
-
- lightly-train input_dir={INPUT_DIR}
-
-**Checkpoints** from your training process will be stored in the output directory. You can continue training from such
-a checkpoint by copying the checkpoint to the shared directory and then passing the checkpoint filename to the container:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- stopping_condition.n_samples=0.3 \
- enable_training=True \
- checkpoint=lightly_epoch_99.ckpt
-
-You may not always want to train for exactly 100 epochs with the default settings. The next section will
-explain how to customize the default settings.
-
-
-Accessing Lightly Input Parameters
------------------------------------
-The docker container is a wrapper around the lightly Python package.
-Hence, for training and embedding the user can access all the settings from
-the lightly command-line tool. Just prepend the parameter with **lightly** to
-do so.
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- remove_exact_duplicates=True \
- enable_corruptness_check=True \
- stopping_condition.n_samples=0.3 \
- enable_training=True \
- lightly.trainer.max_epochs=10 \
- lightly.collate.input_size=64 \
- lightly.loader.batch_size=256 \
- lightly.trainer.precision=16 \
- lightly.model.name=resnet-101
-
-
-
-A list of all input parameters can be found here: :ref:`rst-docker-parameters`
-
-
-.. _docker-sampling-from-embeddings:
-
-Selecting from Embeddings File
-----------------------------------
-
-It is also possible to sample directly from embedding files generated by previous runs. For this,
-move the embeddings file to the shared directory, and specify the filename like so:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- remove_exact_duplicates=True \
- enable_corruptness_check=False \
- stopping_condition.n_samples=0.3 \
- embeddings=my_embeddings.csv
-
-The embeddings file should follow the structure of the .csv file created by the
-lightly CLI: :ref:`ref-cli-embeddings-lightly` or as described in :ref:`ref-docker-meta-information`.
-
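-As a sketch, the first lines of such an embeddings file look roughly like this
-(the exact number of embedding columns depends on the model; see the linked
-references for the precise format):
-
-.. code-block:: console
-
- head -n 2 my_embeddings.csv
- > filenames,embedding_0,embedding_1,...,embedding_31,labels
- > image_0.png,0.325,-0.124,...,0.581,0
-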
-Manually Inspecting the Embeddings
-----------------------------------
-Every time you run Lightly Docker you will find an `embeddings.csv` file in the
-output directory. This file contains the embeddings of all samples in your dataset.
-You can use the embeddings for clustering or manual inspection of your dataset.
-
-.. figure:: images/colab_embeddings_example.png
- :align: center
- :alt: Example plot of working with embeddings.csv
-
- Example plot of working with embeddings.csv
-
-
-We provide an
-`example notebook `_
-to learn more about how to work with the embeddings.
-
-Selecting from Video Files
---------------------------
-In case you are working with video files, it is possible to point the docker container
-directly to the video files. This prevents the need to extract the individual frames beforehand.
-To do so, simply store all videos you want to work with in a single directory; the lightly software
-will then automatically load all frames from the videos.
-
-.. code-block:: console
-
- # work on a single video
- data/
- +-- my_video.mp4
-
- # work on several videos
- data/
- +-- my_video_1.mp4
- +-- my_video_2.avi
-
-As you can see, the videos do not need to be in the same file format. An example command for a folder
-structure as shown above could then look like this:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- stopping_condition.n_samples=0.3
-
-Where {INPUT_DIR} is the path to the directory containing the video files.
-
-You can let Lightly Docker automatically extract the selected frames and save
-them in the output folder using `dump_dataset=True`.
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- stopping_condition.n_samples=0.3 \
- dump_dataset=True
-
-
-.. note:: The `dump_dataset` feature by default saves the images in the `png` format. This can take a lot of time
- when working with high-resolution videos. You can speed up the process by specifying the output format
- `output_image_format='jpg'` or the resolution `output_image_size=X` of the images.
-
-
-Removing Exact Duplicates
----------------------------
-With the docker solution, it is possible to remove **only exact duplicates** from the dataset. For this,
-simply set the stopping condition `n_samples` to 1.0 (which translates to 100% of the data). The exact command is:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- remove_exact_duplicates=True \
- stopping_condition.n_samples=1.
-
-
-.. _ref-docker-upload-to-platform:
-
-Upload Sampled Dataset To Lightly Platform
-------------------------------------------
-
-Lightly Docker can automatically push the selected dataset as well as its
-embeddings to the Lightly Platform.
-
-Imagine you have a dataset of 100 videos with 10'000 frames each, 1 million frames
-in total. Using Lightly Docker and the coreset method we sample the most diverse
-50'000 images (a reduction of 20x). Now we push the 50'000 images to the
-Lightly Platform for a more interactive analysis. We can access all metadata as
-well as the embedding view to explore the dataset, find clusters and further curate
-the dataset.
-Finally, we can use the Active Learning capabilities of the Lightly Platform to
-iteratively train, predict, and label the dataset in chunks until we reach the desired
-model accuracy.
-
-To push the selected dataset automatically after running Lightly Docker you can
-append `upload_dataset=True` to the docker run command.
-
-E.g.
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- stopping_condition.n_samples=50000 \
- stopping_condition.min_distance=0.3 \
- upload_dataset=True
-
-You can upload only thumbnails (to save bandwidth) or only metadata (for
-privacy-sensitive data) by adding the argument `lightly.upload=thumbnails`
-or `lightly.upload=meta`.
-
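-As a sketch, uploading only thumbnails would simply append the flag to the run
-command from above:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {SHARED_DIR}:/home/shared_dir \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- stopping_condition.n_samples=50000 \
- stopping_condition.min_distance=0.3 \
- upload_dataset=True \
- lightly.upload=thumbnails
-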
-.. note:: You must specify the stopping condition `n_samples` and set the value
- below 75'000 (the current limit of a dataset in the Lightly Platform).
- We recommend setting both stopping conditions (`min_distance` and
- `n_samples`) in which case selecting stops as soon as the first
- condition is met.
-
-
-
-Reporting
------------------------------------
-
-To facilitate sustainability and reproducibility in ML, the docker container
-has an integrated reporting component. For every dataset you run through the container,
-an output directory gets created with the exact configuration used for the experiment.
-Additionally, plots, statistics, and more information collected
-during the various processing steps are provided.
-E.g. there is information about the corruptness check, embedding process and selection process.
-
-To make it easier for you to understand and discuss the dataset we put the essential information into
-an automatically generated PDF report.
-Sample reports can be found on the `Lightly website `_.
-
-
-.. _ref-docker-runs:
-
-Live View of Docker Status
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-You can get a live status update of the currently running docker runs through
-the `cloud platform `_.
-
-To use the new feature simply follow the steps:
-
-#. Make sure you have the latest docker version installed
- (see :ref:`ref-docker-download-and-install`)
-#. Open a browser and navigate to the `Lightly Platform `_
-#. In the navigation menu on the top click on **My Docker Runs**
-#. Once you start the Lightly Docker you should see the dashboard of the current
- run. Please make sure that you use the same token for the docker run as you
- find in the dashboard.
-
-In the dashboard, you see a
-list of your docker runs and a live update of the active runs. Use this
-view to see whether the data selection is still running as expected.
-
-.. image:: images/docker_runs_overview.png
-
-.. note:: Note that only status updates and error messages are transmitted.
-
-
-Docker Output
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The output directory is structured in the following way:
-
-* config:
- A directory containing copies of the configuration files and overwrites.
-* data:
- The data directory contains everything to do with data.
-
- * If `enable_corruptness_check=True`, it will contain a "clean" version of the dataset.
- * If `remove_exact_duplicates=True`, it will contain a copy of the `embeddings.csv`
- where all duplicates are removed. Otherwise, it will
- simply store the embeddings computed by the model.
-
-* filenames:
- This directory contains lists of filenames of the corrupt images, removed images, selected
- images and the images which were removed because they have an exact duplicate in the dataset.
-* plots:
- A directory containing the plots which were produced for the report.
-* report.pdf
- To provide a simple overview of the filtering process the docker container automatically generates a report.
- The report contains
-
- * information about the job (duration, processed files etc.)
- * estimated savings in terms of labeling costs and CO2 due to the smaller dataset
- * statistics about the dataset before and after the selection process
- * histogram before and after filtering
- * visualizations of the dataset
- * nearest neighbors of retained images among the removed ones
-
-* **NEW** report.json:
- The report is also available as a report.json file. Any value from the pdf
- report can easily be accessed programmatically.
-
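-For example, assuming the `jq` command-line tool is installed, you can list the
-top-level keys of the JSON report directly from the terminal:
-
-.. code-block:: console
-
- jq 'keys' report.json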
-
-Below you find a typical output folder structure.
-
-
-.. code-block:: console
-
- |-- config
- | |-- config.yaml
- | |-- hydra.yaml
- | '-- overrides.yaml
- |-- data
- | |-- al_score_embeddings.csv
- | |-- bounding_boxes.json
- | |-- bounding_boxes_examples
- | |-- embeddings.csv
- | |-- normalized_embeddings.csv
- | |-- sampled
- | '-- selected_embeddings.csv
- |-- filenames
- | |-- corrupt_filenames.txt
- | |-- duplicate_filenames.txt
- | |-- removed_filenames.txt
- | '-- sampled_filenames.txt
- |-- lightly_epoch_1.ckpt
- |-- plots
- | |-- distance_distr_after.png
- | |-- distance_distr_before.png
- | |-- filter_decision_0.png
- | |-- filter_decision_11.png
- | |-- filter_decision_22.png
- | |-- filter_decision_33.png
- | |-- filter_decision_44.png
- | |-- filter_decision_55.png
- | |-- pretagging_histogram_after.png
- | |-- pretagging_histogram_before.png
- | |-- scatter_pca.png
- | |-- scatter_pca_no_overlay.png
- | |-- scatter_umap_k_15.png
- | |-- scatter_umap_k_15_no_overlay.png
- | |-- scatter_umap_k_5.png
- | |-- scatter_umap_k_50.png
- | |-- scatter_umap_k_50_no_overlay.png
- | '-- scatter_umap_k_5_no_overlay.png
- |-- report.json
- '-- report.pdf
-
-
-
-Evaluation of the Selection Process
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-**Histograms and Plots**
-
-The report contains histograms of the pairwise distance between images before and after the selection process.
-
-An example of such a histogram before and after filtering for the CamVid dataset consisting of 367
-samples is shown below. We marked the region which is of special interest with an orange rectangle.
-Our goal is to make this histogram more symmetric by removing samples which are at short distances from each other.
-
-If we remove 25 samples (7%) out of the 367 samples of the CamVid dataset the histogram looks more symmetric
-as shown below. In our experiments, removing 7% of the dataset results in a model with higher validation set accuracy.
-
-.. image:: images/histogram_before_after.jpg
-
-.. note::
-
- Why symmetric histograms are preferred: An asymmetric histogram can be the result of either a dataset with outliers or inliers.
- A heavy tail for low distances means that there is at least one high-density region with many samples very close to each other within the main cluster.
- Having such a high-density region can lead to biased models trained on this particular dataset. A heavy tail towards high distances shows that there is
- at least one high-density region outside the main cluster of samples.
-
-**Retained/Removed Image Pairs**
-
-The report also displays examples of retained images with their nearest neighbor among the removed images. This is a good heuristic to see whether the number
-of retained samples is too small or too large: If the pairs are very different, this may be a sign that too many samples were removed. If the pairs are similar,
-it suggests that more images could be removed.
-
-With the argument stopping_condition.n_samples=X you can set the number of samples which should be kept.
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- remove_exact_duplicates=True \
- enable_corruptness_check=False \
- stopping_condition.n_samples=500
-
-With the argument n_example_images you can determine how many pairs are shown. Note that this must be an even number.
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- remove_exact_duplicates=True \
- enable_corruptness_check=False \
- stopping_condition.n_samples=0.3 \
- n_example_images=32
diff --git a/docs/source/docker_archive/getting_started/hardware_recommendations.rst b/docs/source/docker_archive/getting_started/hardware_recommendations.rst
deleted file mode 100644
index 176f42c9f..000000000
--- a/docs/source/docker_archive/getting_started/hardware_recommendations.rst
+++ /dev/null
@@ -1,96 +0,0 @@
-.. _ref-hardware-recommendations:
-
-Hardware recommendations
-========================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Lightly worker is usually run on dedicated hardware
-or in the cloud on a compute instance
-which is specifically spun up to run Lightly Worker standalone.
-Our recommendations on the hardware requirements of this compute instance are
-based on three criteria:
-
-- speed: The worker should process your dataset as quickly as possible.
-- cost-effectiveness: The compute instance should be economical.
-- stability: The worker should not crash because it runs out of memory.
-
-Depending on your dataset size, we recommend the following machine:
-
-- Up to 100'000 images or video frames: Use the AWS EC2 instance `g4dn.xlarge` or similar
- with 4 vCPUs, 16GB of system memory, one T4 GPU
-- Up to 1 Million images or video frames: Use the AWS EC2 instance `g4dn.2xlarge` or similar
- with 8 vCPUs, 32GB of system memory, one T4 GPU
-- More than 1 Million images or video frames: Use the AWS EC2 instance `g4dn.4xlarge` or similar
- with 16 vCPUs, 64GB of system memory, one T4 GPU
-
-You can compute the number of frames of your videos with their length and fps.
-E.g. 100 videos with 600s length each and 30 fps have 100 * 600 * 30 = 1.8 million frames.
-
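-If you are unsure about the exact frame count of a video, a quick sketch using
-ffprobe (part of ffmpeg) is to decode and count the frames directly; note that
-this reads the whole video and can take a while:
-
-.. code-block:: console
-
- ffprobe -v error -select_streams v:0 -count_frames \
- -show_entries stream=nb_read_frames -of csv=p=0 video.mp4
-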
-If you want to train an embedding model for many epochs or want to further increase computing speed,
-we recommend switching to a V100 or A10 GPU or better.
-
-If you stream the data from a cloud bucket using the datasource feature, make sure that
-the cloud bucket is in the same region as the compute machine.
-Using the same region is very important, see also :ref:`ref-docker-network-traffic-same-region`.
-If you are using the old workflow of reading from a local disk instead, use an SSD.
-However, we recommend the workflow of streaming from a cloud bucket.
-
-
-Keep the configuration option `lightly.loader.num_workers` at the default (-1),
-which will set it to the number of vCPUs on your machine.
-
-Finding the compute speed bottleneck
-------------------------------------
-
-Usually, the compute speed is limited by one of three potential bottlenecks.
-Different steps of the Lightly worker use these resources to a different extent.
-Thus the bottleneck changes throughout the run. The bottlenecks are:
-
-- data read speed: I/O
-- CPU
-- GPU
-
-
-The GPU is used during three steps:
-
-- training an embedding model (optional step)
-- pretagging your dataset (optional step)
-- embedding your dataset
-
-The I/O and CPU are used during the previous three steps and also during the other steps, which may take longer:
-
-- initializing the dataset
-- corruptness check (optional step)
-- dataset dumping & upload (optional step)
-
-Before changing the hardware configuration of your compute instance,
-we recommend first determining the bottleneck by monitoring it:
-
-- You can find out the current disk usage of your machine using the terminal command `iotop`.
-- If you use a datasource, see the current ethernet usage using the terminal command `ifstat`.
-- You can find out the current CPU and RAM usage of your machine using the terminal commands `top` or `htop`.
-- You can find out the current GPU usage (both compute and VRAM) using the terminal command `watch nvidia-smi`.
-- Note that you might need to install these commands using your package manager.
-
-
-In addition to using these tools, you can also compare the relative duration of the different steps to find the bottleneck.
-E.g. if the embedding step takes much longer than the corruptness check, then the GPU is the bottleneck.
-Otherwise, it is the I/O or CPU.
-
-Updating the machine
---------------------
-
-When updating the machine, we recommend updating the resource that causes the
-bottleneck. After that, the bottleneck might have changed.
-
-If there is not one obvious bottleneck, we recommend scaling up I/O, CPU, and GPU together.
-
-To prevent the worker from running out of system memory or GPU memory, we recommend
-about 4GB of RAM and 2GB of VRAM for each vCPU.
-
-
diff --git a/docs/source/docker_archive/getting_started/images/colab_embeddings_example.png b/docs/source/docker_archive/getting_started/images/colab_embeddings_example.png
deleted file mode 100644
index 8c1c14e74..000000000
Binary files a/docs/source/docker_archive/getting_started/images/colab_embeddings_example.png and /dev/null differ
diff --git a/docs/source/docker_archive/getting_started/images/docker_runs_overview.png b/docs/source/docker_archive/getting_started/images/docker_runs_overview.png
deleted file mode 100644
index 5b678596d..000000000
Binary files a/docs/source/docker_archive/getting_started/images/docker_runs_overview.png and /dev/null differ
diff --git a/docs/source/docker_archive/getting_started/images/docker_workers_overview_empty.png b/docs/source/docker_archive/getting_started/images/docker_workers_overview_empty.png
deleted file mode 100644
index 74ac820eb..000000000
Binary files a/docs/source/docker_archive/getting_started/images/docker_workers_overview_empty.png and /dev/null differ
diff --git a/docs/source/docker_archive/getting_started/images/docker_workers_overview_registered.png b/docs/source/docker_archive/getting_started/images/docker_workers_overview_registered.png
deleted file mode 100644
index f7481720e..000000000
Binary files a/docs/source/docker_archive/getting_started/images/docker_workers_overview_registered.png and /dev/null differ
diff --git a/docs/source/docker_archive/getting_started/images/histogram_before_after.jpg b/docs/source/docker_archive/getting_started/images/histogram_before_after.jpg
deleted file mode 100644
index 4083dff22..000000000
Binary files a/docs/source/docker_archive/getting_started/images/histogram_before_after.jpg and /dev/null differ
diff --git a/docs/source/docker_archive/getting_started/setup.rst b/docs/source/docker_archive/getting_started/setup.rst
deleted file mode 100644
index 9771c528f..000000000
--- a/docs/source/docker_archive/getting_started/setup.rst
+++ /dev/null
@@ -1,144 +0,0 @@
-.. _ref-docker-setup:
-
-Setup
-===================================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-
-Analytics
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The docker container currently reports usage metrics to our analytics software
-(we use mixpanel), which uses HTTPS-encrypted GET and POST requests to https://api.mixpanel.com.
-The transmitted data includes information about crashes and the number of samples
-that have been filtered. However, **the data does not include input / output samples**,
-filenames, or any other information which could be sensitive to our customers.
-
-
-
-Licensing
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Licensing and account management are done through the :ref:`ref-authentication-token`,
-just as if you were using Lightly. The token is used to authenticate your account.
-The authentication happens at every run of the container. Make sure the docker
-container has a working internet connection and has access to
-https://api.lightly.ai.
-
-
-.. _ref-docker-download-and-install:
-
-Download the Docker Image
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Ask your account manager from Lightly for the credentials
-to download the docker container.
-
-
-In short, installing the Docker container consists of the following steps:
-
-#. Copy the *container-credentials.json* to the instance you want to use for filtering
-#. Authenticate Docker to download the Lightly image
-#. Pull the Docker image
-#. Check whether the container works
-
-**First**, we need to access the private container registry of Lightly.
-You received a *container-credentials.json* file from your account manager.
-
-**Second**, to be able to download the docker image you need to log in with these credentials.
-The following command authenticates your local docker installation.
-We assume *container-credentials.json* is in your current directory.
-
-.. code-block:: console
-
- cat container-credentials.json | docker login -u _json_key --password-stdin https://eu.gcr.io
-
-If the above command does not work, try the following:
-
-.. code-block:: console
-
- cat container-credentials.json | docker login -u json_key --password-stdin https://eu.gcr.io
-
-
-.. note:: When docker is freshly installed only the root user
-    can run docker commands. There are two ways to deal with this:
-
-
-#. Give your user permission to run docker - recommended
-   (see https://docs.docker.com/engine/install/linux-postinstall/).
-#. Run docker commands as root (always replace `docker` with `sudo docker`) - functional but less secure.
-
-For example, to authenticate as a non-root user you would run
-
-.. code-block:: console
-
- cat container-credentials.json | sudo docker login -u _json_key --password-stdin https://eu.gcr.io
-
-
-**Third**, after authentication you should be able to pull our latest image.
-The following command pulls the latest image from our European cloud server:
-
-.. code-block:: console
-
- docker pull eu.gcr.io/boris-250909/lightly/worker:latest
-
-.. warning::
-
-    Until version 2.1.8 the latest image was named `eu.gcr.io/boris-250909/lightly/sampling:latest`.
-    From version 2.2 onwards the image is called `eu.gcr.io/boris-250909/lightly/worker:latest`.
-    Please make sure to update any old docker run commands to use the new image name.
-
-
-The downloaded image has a long name. We can shorten it by making use of *docker tag*.
-The examples in this documentation use the image name *lightly/worker:latest*.
-Create the corresponding docker tag using the following command:
-
-.. code-block:: console
-
- docker tag eu.gcr.io/boris-250909/lightly/worker:latest lightly/worker:latest
-
-
-.. note:: If you don't want to tag the image you can replace lightly/worker:latest
-    with eu.gcr.io/boris-250909/lightly/worker:latest in all commands in this documentation.
-
-.. _ref-docker-setup-sanity-check:
-
-Sanity Check
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-**Finally**, verify the correctness of the docker container by running the following command:
-
-.. code-block:: console
-
- docker run --rm -it lightly/worker:latest sanity_check=True
-
-You should see an output similar to this one:
-
-.. code-block:: console
-
- [2022-05-02 20:37:27] Lightly Docker Solution v2.2.0
- [2022-05-02 20:37:27] Congratulations! It looks like the Lightly container is running!
-
-Head on to :ref:`rst-docker-first-steps` to see how to sample your dataset!
-
-
-Update Lightly Docker
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-To update the docker we simply need to pull the latest docker image.
-
-.. code-block:: console
-
- docker pull eu.gcr.io/boris-250909/lightly/worker:latest
-
-Don't forget to tag the image again after pulling it.
-
-.. code-block:: console
-
- docker tag eu.gcr.io/boris-250909/lightly/worker:latest lightly/worker:latest
diff --git a/docs/source/docker_archive/images/lightly_docker_overview.png b/docs/source/docker_archive/images/lightly_docker_overview.png
deleted file mode 100644
index 00f319afb..000000000
Binary files a/docs/source/docker_archive/images/lightly_docker_overview.png and /dev/null differ
diff --git a/docs/source/docker_archive/integration/dagster_aws.rst b/docs/source/docker_archive/integration/dagster_aws.rst
deleted file mode 100644
index 063cf76e7..000000000
--- a/docs/source/docker_archive/integration/dagster_aws.rst
+++ /dev/null
@@ -1,513 +0,0 @@
-
-.. _ref-docker-integration-aws-dagster:
-
-Data Pre-processing Pipeline on AWS with Dagster
-===================================================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-
-Introduction
---------------
-Data collection and pre-processing pipelines have become more and more automated in recent years. The Lightly Docker can take on a crucial role
-in such a pipeline as it can reliably filter out redundant and corrupted images with high throughput.
-
-This guide shows how to write a simple automated data pre-processing pipeline which performs the following steps:
-
-1. Download a random video from `Pexels `_.
-2. Upload the video to an S3 bucket.
-3. Run the Lightly Docker on the video to extract a diverse set of frames for further processing:
-
- a. Spin up an EC2 instance.
-
- b. Run the Lightly Docker
-
- c. Store the extracted frames in the S3 bucket
-
- d. Stop the EC2 instance
-
-Here, the first two steps simulate a data collection process.
-
-.. note::
-
- The datapool option of the Lightly Docker allows it to remember frames/images it has seen
- in past executions of the pipeline and ignore images which are too similar to already known ones.
-
-
-Dagster
----------
-Dagster is an open-source data orchestrator for machine learning. It enables building, deploying, and
-debugging data processing pipelines. Click `here `__ to learn more.
-
-
-Setting up the EC2 Instance
------------------------------
-The first step is to set up the EC2 instance. For the purposes of this tutorial,
-it's recommended to pick an instance with a GPU (like the g4dn.xlarge) and the "Deep Learning AMI (Ubuntu 18.04) Version 48.0" AMI.
-See `this guide `_ to get started. Connect to the instance.
-
-
-Next, the Lightly Docker should be installed. Please follow the instructions `here `__.
-You can test if the installation was successful like this:
-
-.. code-block:: console
-
- docker run --rm -it lightly/worker:latest sanity_check=True
-
-To run the docker remotely, it's recommended to write a `run.sh` script with default parameters. The other parameters can then
-be changed by passing command line arguments. Use the following as a starting point and adapt it to your needs:
-
-.. code-block:: shell
-
- # general
- IMAGE=lightly/worker:latest
-
- INPUT_DIR=$1
- SHARED_DIR=/home/ubuntu/shared_dir
- OUTPUT_DIR=/home/ubuntu/lightly-aws-bucket/output_dir
-
- # api
- LIGHTLY_TOKEN=YOUR_LIGHTLY_TOKEN
-
- # run command
- docker run --gpus all --rm --shm-size="512m" \
- -v ${INPUT_DIR}:/home/input_dir \
- -v ${OUTPUT_DIR}:/home/output_dir \
- -v ${SHARED_DIR}:/home/shared_dir \
- --ipc="host" --network "host" \
- ${IMAGE} token=${LIGHTLY_TOKEN} \
- lightly.loader.num_workers=0 \
- enable_corruptness_check=True \
- remove_exact_duplicates=True \
- stopping_condition.n_samples=0.1 \
- upload_dataset=True \
- dump_dataset=True \
- datapool.name=lightly-datapool \
- >> /home/ubuntu/log.txt
-
-
-.. note::
-
- The above run command samples 10% of the frames for every input. After selection, it uploads the selected images to the Lightly Platform
- and saves them to the output directory. The datapool option allows the Lightly Docker to remember already seen frames and adapt decisions based
- on this knowledge. Learn more about the configuration of the `run.sh` file `here `_.
-
-
-
-Setting up the S3 Bucket
---------------------------
-If you don't have an S3 bucket already, follow `these `_ instructions to create one.
-For the purpose of this tutorial, name the bucket `lightly-aws-bucket`. If you want to use a different S3 bucket, remember to replace all occurrences
-of `lightly-aws-bucket` in the rest of this guide.
-
-
-To access the data in the S3 bucket, the S3 bucket must be mounted on the EC2 instance. This can be done with the s3fs library.
-
-First, install the library:
-
-.. code-block:: console
-
- sudo apt install s3fs
-
-
-Then, set the `user_allow_other` flag in the `/etc/fuse.conf` file and add the following line to `/etc/fstab`:
-
-.. code-block:: console
-
- s3fs#lightly-aws-bucket /home/ubuntu/lightly-aws-bucket/ fuse _netdev,allow_other,umask=000,passwd_file=/home/ubuntu/.passwd-s3fs 0 0
-
-Finally, create a password file which contains your AWS credentials and mount the S3 bucket:
-
-.. code-block:: console
-
-    echo "YOUR_AWS_ACCESS_KEY_ID:YOUR_AWS_SECRET_ACCESS_KEY" >> ~/.passwd-s3fs
-    # s3fs refuses password files that are readable by other users
-    chmod 600 ~/.passwd-s3fs
- mkdir ~/lightly-aws-bucket
- sudo mount -a
-
-
-Integration
--------------
-
-Before you start, install the following dependencies:
-
-
-.. code:: console
-
- pip install pypexels
- pip install boto3
- pip install dagster
-
-
-Now that everything is set up, begin building the data processing pipeline. Dagster pipelines consist of several `solids` which can
-be chained one after another. Put each solid in a separate file and aim for the following directory structure:
-
-.. code:: console
-
- ./source
- ├── aws_example_pipeline.py
- └── solids
- ├── aws
- │ ├── lightly.py
- │ └── s3.py
- └── pexels.py
-
-
-The following code is the content of `pexels.py` and represents the first solid in the pipeline.
-It downloads a random video from `Pexels `_ and saves it in the current
-working directory. Don't forget to set the `PEXELS_API_KEY`.
-
-
-.. code-block:: python
-
- import os
- import string
- import random
- import requests
-
- from typing import List
-
- from pypexels import PyPexels
-
- from dagster import solid
-
-
- PEXELS_API_KEY = 'YOUR_PEXELS_API_KEY'
-
-
- class PexelsClient:
- """Pexels client to download a random popular video.
-
- """
-
- def __init__(self):
- self.api = PyPexels(api_key=PEXELS_API_KEY)
-
-
- def random_filename(self, size_: int = 8):
- """Generates a random filename of uppercase letters and digits.
-
- """
- chars = string.ascii_uppercase + string.digits
- return ''.join(random.choice(chars) for _ in range(size_)) + '.mp4'
-
-
- def download_video(self, root: str):
- """Downloads a random popular video from pexels and saves it.
-
- """
- popular_videos = self.api.videos_popular(per_page=40)._body['videos']
- video = random.choice(popular_videos)
- video_file = video['video_files'][0]
- video_link = video_file['link']
-
-            response = requests.get(video_link)
-
- path = os.path.join(root, self.random_filename())
- with open(path, 'wb') as outfile:
-                outfile.write(response.content)
-
- return path
-
-
- @solid
- def download_random_video_from_pexels() -> str:
- """Dagster solid to download a random pexels video to the current directory.
-
- Returns:
- The path to the downloaded video.
-
- """
-
- client = PexelsClient()
- path = client.download_video('./')
-
- return path
-
-
-The next solid in the pipeline (`s3.py`) uploads the video to the S3 bucket. It saves the video
-in a randomly created subfolder in the S3 bucket and passes the object name to the next solid.
-Set the `BUCKET_NAME` and `REGION_NAME` to your bucket name and region of the EC2 instance.
-
-
-.. code-block:: python
-
- import os
- import string
- import random
-
- import boto3
- from botocore.exceptions import ClientError
-
- from dagster import solid
-
-
- BUCKET_NAME: str = 'lightly-aws-bucket'
- REGION_NAME: str = 'YOUR_REGION_NAME' # e.g. eu-central-1
-
-
- class S3Client:
- """S3 client to upload files to a bucket.
-
- """
-
- def __init__(self):
- self.s3 = boto3.client('s3', region_name=REGION_NAME)
-
-
- def random_subfolder(self, size_: int = 8):
- """Generates a random subfolder name of uppercase letters and digits.
-
- """
- chars = string.ascii_uppercase + string.digits
- return ''.join(random.choice(chars) for _ in range(size_))
-
-
- def upload_file(self, filename: str):
- """Uploads the file at filename to the s3 bucket.
-
- Generates a random subfolder so the file will be stored at:
- >>> BUCKET_NAME/RANDOM_SUBFOLDER/basefilename.mp4
-
- """
-
- # upload file to lightly-aws-bucket/input_dir/RANDOM_STRING/basename.mp4
- object_name = os.path.join(
- 'input_dir',
- self.random_subfolder(),
- os.path.basename(filename)
- )
-
- # Upload the file
- try:
- self.s3.upload_file(filename, BUCKET_NAME, object_name)
- except ClientError as e:
- print(e)
- return None
-
- return object_name
-
-
- @solid
- def upload_video_to_s3(filename: str) -> str:
- """Dagster solid to upload a video to an s3 bucket.
-
- Args:
- filename:
- Path to the video which should be uploaded.
-
- Returns:
- The name of the object in the s3 bucket.
-
- """
-
- s3_client = S3Client()
- object_name = s3_client.upload_file(filename)
-
- return object_name
-
-
-Finally, the last solid in the pipeline (`lightly.py`) spins up the EC2 instance, runs the Lightly Docker on the object name passed
-by the previous solid, and then stops the EC2 instance again. Set the `REGION_NAME`, `INSTANCE_ID`, and `MOUNTED_DIR` if
-necessary.
-
-
-.. code-block:: python
-
- import os
- import time
-
- import boto3
- from botocore.exceptions import ClientError
-
- from dagster import solid
-
-
- REGION_NAME: str = 'YOUR_REGION_NAME' # e.g. eu-central-1
- INSTANCE_ID: str = 'YOUR_INSTANCE_ID'
- MOUNTED_DIR: str = '/home/ubuntu/lightly-aws-bucket'
-
-
- class EC2Client:
- """EC2 client to start, run, and stop instances.
-
- """
-
- def __init__(self):
- self.ec2 = boto3.client('ec2', region_name=REGION_NAME)
- self.ssm = boto3.client('ssm', region_name=REGION_NAME)
-
-
- def wait(self, client, wait_for: str, **kwargs):
- """Waits for a certain status of the ec2 or ssm client.
-
- """
- waiter = client.get_waiter(wait_for)
- waiter.wait(**kwargs)
- print(f'{wait_for}: OK')
-
-
- def start_instance(self, instance_id: str):
- """Starts the EC2 instance with the given id.
-
- """
- # Do a dryrun first to verify permissions
- try:
- self.ec2.start_instances(
- InstanceIds=[instance_id],
- DryRun=True
- )
- except ClientError as e:
- if 'DryRunOperation' not in str(e):
- raise
-
- # Dry run succeeded, run start_instances without dryrun
- try:
- self.ec2.start_instances(
- InstanceIds=[instance_id],
- DryRun=False
- )
- except ClientError as e:
- print(e)
-
- self.wait(self.ec2, 'instance_exists')
- self.wait(self.ec2, 'instance_running')
-
-
- def stop_instance(self, instance_id: str):
- """Stops the EC2 instance with the given id.
-
- """
- # Do a dryrun first to verify permissions
- try:
- self.ec2.stop_instances(
- InstanceIds=[instance_id],
- DryRun=True
- )
- except ClientError as e:
- if 'DryRunOperation' not in str(e):
- raise
-
- # Dry run succeeded, call stop_instances without dryrun
- try:
- self.ec2.stop_instances(
- InstanceIds=[instance_id],
- DryRun=False
- )
- except ClientError as e:
- print(e)
-
- self.wait(self.ec2, 'instance_stopped')
-
-
- def run_command(self, command: str, instance_id: str):
- """Runs the given command on the instance with the given id.
-
- """
-
- # Make sure the instance is OK
- time.sleep(10)
-
- response = self.ssm.send_command(
- DocumentName='AWS-RunShellScript',
- Parameters={'commands': [command]},
- InstanceIds=[instance_id]
- )
- command_id = response['Command']['CommandId']
-
- # Make sure the command is pending
- time.sleep(10)
-
- try:
- self.wait(
- self.ssm,
- 'command_executed',
- CommandId=command_id,
- InstanceId=INSTANCE_ID,
- WaiterConfig={
- 'Delay': 5,
- 'MaxAttempts': 1000,
- }
- )
- except:
- # pretty print error message
- import pprint
- pprint.pprint(
- self.ssm.get_command_invocation(
- CommandId=command_id,
- InstanceId=INSTANCE_ID,
- )
- )
-
-
- @solid
- def run_lightly_onprem(object_name: str) -> None:
- """Dagster solid to run Lightly On-premise on a remote EC2 instance.
-
- Args:
- object_name:
- S3 object containing the input video(s) for Lightly.
-
- """
-
- # object name is of format path/RANDOM_DIR/RANDOM_NAME.mp4
- # so the input directory is the RANDOM_DIR
- input_dir = object_name.split('/')[-2]
-
- # input dir is mounted_dir/input_dir/batch/
- input_dir = os.path.join(MOUNTED_DIR, 'input_dir', input_dir)
-
- ec2_client = EC2Client()
- ec2_client.start_instance(INSTANCE_ID)
- ec2_client.run_command(f'/home/ubuntu/run.sh {input_dir}', INSTANCE_ID)
- ec2_client.stop_instance(INSTANCE_ID)
-
-
-To put the solids together in a single pipeline, save the following code in `aws_example_pipeline.py`:
-
-
-.. code-block:: python
-
- from dagster import pipeline
-
- from solids.pexels import download_random_video_from_pexels
- from solids.aws.s3 import upload_video_to_s3
- from solids.aws.lightly import run_lightly_onprem
-
-
- @pipeline
- def aws_example_pipeline():
- """Example data processing pipeline with Lightly on AWS.
-
- The pipeline performs the following three steps:
- - Download a random video from pexels
- - Upload the video to an s3 bucket
- - Run the Lightly pre-selection solution on the video and store the
- extracted frames in the s3 bucket
-
- """
- file_name = download_random_video_from_pexels()
- object_name = upload_video_to_s3(file_name)
- run_lightly_onprem(object_name)
-
-
-Dagster allows you to visualize pipelines in a web interface. The following command
-shows the above pipeline on `127.0.0.1:3000`:
-
-.. code-block:: console
-
- dagit -f aws_example_pipeline.py
-
-
-Finally, you can execute the pipeline with the following command:
-
-
-.. code-block:: console
-
- dagster pipeline execute -f aws_example_pipeline.py
-
-For automatic execution of the pipeline, you can install a cronjob, trigger the pipeline
-upon certain events, or deploy it to an `AWS EC2 or GCP GCE `_.
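-
-As a sketch, a cronjob could simply invoke the same execute command on a schedule,
-e.g. every night at 2am (the directory containing `aws_example_pipeline.py` is a placeholder):
-
-.. code-block:: console
-
-    # edit the crontab with `crontab -e` and add a line like
-    0 2 * * * cd /home/ubuntu/source && dagster pipeline execute -f aws_example_pipeline.py >> /home/ubuntu/pipeline.log 2>&1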
\ No newline at end of file
diff --git a/docs/source/docker_archive/integration/docker_trigger_from_api.rst b/docs/source/docker_archive/integration/docker_trigger_from_api.rst
deleted file mode 100644
index dcc0ae219..000000000
--- a/docs/source/docker_archive/integration/docker_trigger_from_api.rst
+++ /dev/null
@@ -1,199 +0,0 @@
-.. _integration-docker-trigger-from-api:
-
-Trigger a Docker Job from the Platform or code
-===================================================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Introduction
-------------
-The Lightly workflow can be completely automated by using the Lightly Docker as
-a background worker. In this mode, the Lightly Docker waits for incoming jobs
-through the API. After receiving a job, it will download, embed, and subsample
-the provided dataset. The results are immediately available in the webapp for
-visualization and the selected samples are sent back to your
-:ref:`cloud bucket `.
-
-Advantages
-----------
-
-- You can submit jobs through the API, fully automating the Lightly workflow.
-- You can automatically trigger a new job when data is added to your dataset.
-- You can use the Lightly Docker as a background worker that processes new jobs automatically.
-
-
-Download the Lightly Docker
----------------------------
-Please follow the instructions for the :ref:`ref-docker-setup`.
-
-
-Register the Lightly Docker as a Worker
----------------------------------------
-To control the Lightly Docker from the API you have to register it as a worker.
-You can simply go to the Lightly web app and click on My Docker Runs --> My Compute Workers.
-Or just click on the direct link here: `Docker Workers `__
-
-.. image:: ../getting_started/images/docker_workers_overview_empty.png
-
-Click on "Register" in the bottom right corner and enter a name for your worker.
-After confirmation the worker should show up in the worker list.
-
-.. image:: ../getting_started/images/docker_workers_overview_registered.png
-
-Copy the *worker id* and head over to your terminal. You can now start the docker
-with the worker id and it will connect to the API and wait for jobs. To start
-the docker execute the following command:
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v ${OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=${YOUR_LIGHTLY_PLATFORM_TOKEN} \
- worker.worker_id=${YOUR_WORKER_ID}
-
-The state of the worker on the `Docker Workers `__
-page should now indicate that the worker is in an idle state.
-
-
-Create a Dataset and Trigger a Job
------------------------------------
-
-There are two ways to trigger a new job. You can either use the user interface
-provided through our Web App or you can use our Python package and build a script.
-
-
-.. tabs::
-
- .. tab:: Web App
-
- **Create a Dataset**
-
- This recipe requires that you already have a dataset in the Lightly Platform
- configured to use the data in your cloud bucket. Create such a dataset in
- two steps:
-
- 1. `Create a new dataset `_ in Lightly.
- Make sure that you choose the input type `Images` or `Videos` correctly,
- depending on the type of files in your cloud storage bucket.
- 2. Edit your dataset, select your cloud storage provider as your
- datasource and fill out the form.
- In our example we use an S3 bucket.
-
- .. figure:: ../../getting_started/resources/LightlyEdit2.png
- :align: center
- :alt: Lightly S3 connection config
- :width: 60%
-
- Lightly S3 connection config
-
- If you don't know how to fill out the form, follow the full tutorial to create
- a Lightly dataset connected to your bucket:
- :ref:`AWS Secure Storage Solution (S3) `,
- :ref:`Google Cloud Storage (GCS) `,
- :ref:`Azure Blob Storage (Azure) `.
-
-
- .. tab:: Python Code
-
- .. literalinclude:: examples/create_dataset.py
-
-And now we can schedule a new job.
-
-.. tabs::
-
- .. tab:: Web App
-
- **Trigger the Job**
-
- To trigger a new job you can click on the schedule run button on the dataset
- overview as shown in the screenshot below:
-
- .. figure:: images/schedule-compute-run.png
-
- After clicking on the button you will see a wizard to configure the parameters
- for the job.
-
- .. figure:: images/schedule-compute-run-config.png
-
- In our example we use the following parameters.
-
- .. literalinclude:: ../code_examples/webapp_default_worker_config.txt
- :caption: Docker Config
- :language: javascript
-
- .. literalinclude:: ../code_examples/webapp_default_lightly_config.txt
- :caption: Lightly Config
- :language: javascript
-
- Once the parameters are set you can schedule the run using a click on **schedule**.
-
- .. tab:: Python Code
-
- .. literalinclude:: examples/trigger_job.py
-
-
-View the progress of the Lightly Docker
----------------------------------------
-
-To see the progress of your docker run, go to the Lightly Platform and head to
-`My Docker Runs `_
-
-.. image:: ../getting_started/images/docker_runs_overview.png
-
-
-Use your selected dataset
----------------------------
-
-Once the docker run has finished, you can see your selected dataset in the
-Lightly Platform:
-
-.. image:: ./images/webapp-explore-after-docker.jpg
-
-In our case, we had 4 short street videos with about 1000 frames each in our cloud storage
-bucket and selected 50 frames from them. Now you can analyze your dataset in the
-embedding and metadata view of the Lightly Platform, subsample it further, or
-export it for labeling. In our case we conclude that the raw data
-we have does not cover enough cases and thus decide that we want to first
-collect more street videos.
-
-Process new data in your bucket using a datapool
-------------------------------------------------
-Over time you will most likely receive new raw data from your various
-sources and add it to your cloud storage bucket. In our case we added 4
-additional street videos to the bucket. The new raw data might
-include samples which should be added to your dataset in the Lightly Platform,
-so you want to add a subset of them to your dataset.
-
-This workflow is supported by the Lightly Platform using a datapool. It
-remembers which raw data in your bucket has already been processed and will
-ignore it in future docker runs. Thus you can send the same job again to the
-worker. It will find your new raw data in the bucket, stream, embed
-and subsample it and then add it to your existing dataset. The selection strategies will
-take the existing data in your dataset into account when selecting new data to be
-added to your dataset.
-
-.. image:: ./images/webapp-embedding-after-2nd-docker.png
-
-After the docker run we can go to the embedding view of the Lightly Platform to
-see the newly added samples there in a new tag. We see that the new samples
-(in green) fill some gaps left by the images in the first iteration (in grey).
-However, there are still some gaps left, which could be filled by adding more
-videos to the bucket and running the docker again.
-
-This workflow of iteratively growing your dataset with the Lightly Docker has
-the following advantages:
-
-- You can learn from your findings after each iteration
- to know which raw data you need to collect next.
-- Only your new data is processed, saving you time and compute cost.
-- You don't need to configure anything, just run the same job again.
-- Only samples which are different to the existing ones are added to the dataset.
-
-If you want to search all data in your bucket for new samples
-instead of only newly added data,
-then set `datasource.process_all=True` in your docker run command.
diff --git a/docs/source/docker_archive/integration/docker_with_datasource.rst b/docs/source/docker_archive/integration/docker_with_datasource.rst
deleted file mode 100644
index c3547431e..000000000
--- a/docs/source/docker_archive/integration/docker_with_datasource.rst
+++ /dev/null
@@ -1,234 +0,0 @@
-
-.. _ref-docker-with-datasource:
-
-Using the Docker with a Cloud Bucket as Remote Datasource
-=========================================================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Introduction
-------------
-The Lightly Docker can be used with the Lightly Platform to do
-the following workloads in one single run:
-
-- stream your files directly from your cloud bucket to your local machine without
- needing to sync or download them
-- embed all images or video frames
-- sample a subset, e.g. using coreset
-- compute the metadata of the images
-- create a dataset in the Lightly Platform from the selected subset
-
-Lightly supports the following cloud storage solutions:
-
-- `AWS Secure Storage Solution (S3) `_
-- `Google Cloud Storage (GCS) `_
-- `Azure Blob Storage (Azure) `_
-
-
-Advantages
-----------
-
-- You can run the whole Lightly workflow with one single command.
-- You can process videos directly without needing to extract frames.
-- Your data is streamed from your bucket on the fly instead of being downloaded to
-  your local disk first and then read from there. You save both a lot of time and
-  disk space and can process much larger datasets.
-- You can :ref:`process new data in your bucket using a datapool `
- to continuously update and improve your selected dataset
-  every time new data comes into your bucket.
-- Your images and videos are never saved anywhere but in your bucket,
- maintaining your privacy and security.
-
-
-.. note::
-
- Please ensure that the bucket and the instance running the Lightly Docker are
- in the same cloud region (S3, Azure) or zone (GCS). If you are using S3 and
- your bucket is in `eu-central-1` ensure your EC2 instance is also running in
-   `eu-central-1`. If the region or zone are not the same there can be
- **additional transfer costs** and **degraded transfer speeds**. Please consult
- the pricing page of your cloud provider for more details
- (`S3 `_,
- `GCS `_,
- `Azure `_).
-
-
-Requirements
-------------
-
-This recipe requires that you already have a dataset in the Lightly Platform
-configured to use the data in your bucket. You can create such a dataset in two
-steps:
-
-1. `Create a new dataset `_ in Lightly.
- Make sure that you choose the input type `Images` or `Videos` correctly,
- depending on the type of files in your bucket.
-2. Edit your dataset, select your datasource and fill out the form.
-
- .. figure:: ../../getting_started/resources/LightlyEdit2.png
- :align: center
- :alt: Lightly S3 connection config
- :width: 60%
-
- Lightly S3 connection config
-
-If you don't know how to fill out the form, follow the full tutorial to create
-a Lightly dataset connected to your bucket: :ref:`S3 `,
-:ref:`GCS `,
-:ref:`Azure `.
-
-
-Furthermore, you should have access to a machine running docker. Ideally, it
-also has a CUDA-GPU. A GPU will speed up the process significantly, especially
-for large datasets.
-
-
-Download the Lightly Docker
----------------------------
-Next, the Lightly Docker should be installed.
-Please follow the instructions for the :ref:`ref-docker-setup`.
-
-
-Run the Lightly Docker with the datasource
-------------------------------------------
-Head to the :ref:`rst-docker-first-steps` to get a general idea of what the docker
-can do.
-
-To run the docker with a remote datasource,
-use the parameter `datasource.dataset_id=YOUR_DATASET_ID`.
-You can find the dataset id in the Lightly Platform.
-E.g. run the docker with
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v {OUTPUT_DIR}:/home/output_dir \
- lightly/worker:latest \
- token=YOUR_LIGHTLY_PLATFORM_TOKEN \
- datasource.dataset_id=YOUR_DATASET_ID \
- stopping_condition.n_samples=50
-
-
-View the progress of the Lightly Docker
----------------------------------------
-
-To see the progress of your docker run, go to the Lightly Platform and
-head to `My Docker Runs `_
-
-.. image:: ../getting_started/images/docker_runs_overview.png
-
-Use your selected dataset
--------------------------
-
-Once the docker run has finished, you can see your selected dataset in the Lightly Platform:
-
-.. image:: ./images/webapp-explore-after-docker.jpg
-
-In our case, we had 4 short street videos with about 1000 frames each in our
-cloud storage bucket and selected 50 frames from them.
-Now you can analyze your dataset in the embedding and metadata view of the Lightly Platform,
-subsample it further, or export it for labeling.
-In our case we conclude that the raw data we have
-does not cover enough cases and thus
-decide that we want to first collect more street videos.
-
-.. _ref-docker-with-datasource-datapool:
-
-Process new data in your bucket using a datapool
-------------------------------------------------
-
-From time to time you will probably get new raw data added to your bucket.
-In our case we added 4 more street videos to the bucket.
-The new raw data might include samples which should be added to your dataset
-in the Lightly Platform, so you want to add a subset of them to your dataset.
-
-This workflow is supported by the Lightly Platform using a datapool.
-It remembers which raw data in your bucket has already been processed
-and will ignore it in future docker runs.
-Thus you can run the docker with the same command again. It will find
-your new raw data in the bucket, stream, embed and subsample it and then add it to
-your existing dataset. The selection strategy will take the existing data in your dataset
-into account when selecting new data to be added to your dataset.
-
-.. image:: ./images/webapp-embedding-after-2nd-docker.png
-
-After the docker run we can go to the embedding view of the Lightly Platform
-to see the newly added samples there in a new tag. We see that the new samples
-(in green) fill some gaps left by the images in the first iteration (in grey).
-However, there are still some gaps left, which could be filled by adding more videos
-to the bucket and running the docker again.
-
-This workflow of iteratively growing your dataset with the Lightly Docker
-has the following advantages:
-
-- You can learn from your findings after each iteration
- to know which raw data you need to collect next.
-- Only your new data is processed, saving you time and compute cost.
-- You don't need to configure anything, just run the same command again.
-- Only samples which are different to the existing ones are added to the dataset.
-
-If you want to search all data in your bucket for new samples
-instead of only newly added data,
-then set `datasource.process_all=True` in your docker run command.
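-
-As a sketch, based on the run command shown earlier, this simply means appending the
-flag as an additional argument:
-
-.. code-block:: console
-
-    docker run --gpus all --rm -it \
-        -v {OUTPUT_DIR}:/home/output_dir \
-        lightly/worker:latest \
-        token=YOUR_LIGHTLY_PLATFORM_TOKEN \
-        datasource.dataset_id=YOUR_DATASET_ID \
-        stopping_condition.n_samples=50 \
-        datasource.process_all=True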
-
-
-.. _ref-docker-network-traffic-same-region:
-
-Network traffic
----------------
-
-Please ensure that the bucket and the instance running the Lightly Docker are
-in the same cloud region (S3, Azure) or zone (GCS). E.g. if you are using S3,
-have the instance running in `eu-central-1` and the bucket also in
-`eu-central-1`. If the region or zone are not the same there can be
-**additional transfer costs** and **degraded transfer speeds**. Please consult
-the pricing page of your cloud provider for more details
-(`S3 `_,
-`GCS `_,
-`Azure `_).
-
-
-The worker causes significant network traffic at the following steps:
-
-For image datasets:
-^^^^^^^^^^^^^^^^^^^
-
-- The corruptness check downloads the complete dataset.
-- Training the embedding model downloads the complete dataset once each epoch.
-- Embedding downloads the non-corrupt dataset.
-- Pretagging downloads the non-corrupt dataset.
-- Dumping the selected dataset downloads it.
-- Updating the selected dataset in the Lightly platform
- will first download all newly selected images to compute their metadata.
-
-As an example: if you have a dataset of 10GB in size
-and run Lightly with training an embedding model for 10 epochs, you will face
-at most (10 + 5) * 10GB = 150GB of download traffic.
-
-
-
-For video datasets:
-^^^^^^^^^^^^^^^^^^^
-
-.. note::
- Depending on the video format, downloading a single frame might require downloading the entire video.
-   Thus downloading X frames from Y different videos might download all Y videos in the worst case.
-
-- Initializing the dataset to find out the number of frames per video downloads the complete dataset.
-- The corruptness check downloads the complete dataset.
-- Training the embedding model downloads the complete dataset once each epoch.
-- Embedding downloads the non-corrupt dataset.
-- Pretagging downloads the non-corrupt dataset.
-- Dumping the selected dataset will download each frame in it.
- This might download the full dataset, if at least one frame was selected from each video.
-- Updating the selected dataset in the Lightly platform
- will first download all newly selected images to compute their metadata.
-  Similar to dumping the dataset, this might download the complete dataset in the worst case.
-
-As an example: if you have a dataset of 10GB in size
-and run Lightly with training an embedding model for 10 epochs, you will face
-at most (10 + 6) * 10GB = 160GB of download traffic.
diff --git a/docs/source/docker_archive/integration/examples/create_dataset.py b/docs/source/docker_archive/integration/examples/create_dataset.py
deleted file mode 100644
index ce8e9aee8..000000000
--- a/docs/source/docker_archive/integration/examples/create_dataset.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import lightly
-
-# Create the Lightly client to connect to the API.
-client = lightly.api.ApiWorkflowClient(token="LIGHTLY_TOKEN")
-
-# Create a new dataset on the Lightly Platform.
-client.create_dataset("dataset-name")
-
-# Connect the dataset to your cloud bucket.
-
-# AWS S3
-client.set_s3_config(
- resource_path="s3://bucket/dataset/",
- region="eu-central-1",
- access_key="ACCESS-KEY",
- secret_access_key="SECRET",
- thumbnail_suffix=None,
-)
-
-# Google Cloud Storage
-import json
-
-client.set_gcs_config(
- resource_path="gs://bucket/dataset/",
- project_id="PROJECT-ID",
- credentials=json.dumps(json.load(open("credentials.json"))),
- thumbnail_suffix=None,
-)
-
-# Azure Blob Storage
-client.set_azure_config(
- container_name="container/dataset/",
- account_name="ACCOUNT-NAME",
- sas_token="SAS-TOKEN",
- thumbnail_suffix=None,
-)
diff --git a/docs/source/docker_archive/integration/examples/trigger_job.py b/docs/source/docker_archive/integration/examples/trigger_job.py
deleted file mode 100644
index f4749aae6..000000000
--- a/docs/source/docker_archive/integration/examples/trigger_job.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# You can reuse the client from the previous script. If you want to create a new
-# one you can uncomment the following line:
-# client = lightly.api.ApiWorkflowClient(token="LIGHTLY_TOKEN", dataset_id="DATASET_ID")
-
-# Schedule the compute run using our custom config.
-# We show here the full default config so you can easily edit the
-# values according to your needs.
-client.schedule_compute_worker_run(
- worker_config={
- "enable_corruptness_check": True,
- "remove_exact_duplicates": True,
- "enable_training": False,
- "pretagging": False,
- "pretagging_debug": False,
- "method": "coreset",
- "stopping_condition": {"n_samples": 0.1, "min_distance": -1},
- "scorer": "object-frequency",
- "scorer_config": {"frequency_penalty": 0.25, "min_score": 0.9},
- },
- lightly_config={
- "loader": {
- "batch_size": 16,
- "shuffle": True,
- "num_workers": -1,
- "drop_last": True,
- },
- "model": {"name": "resnet-18", "out_dim": 128, "num_ftrs": 32, "width": 1},
- "trainer": {"gpus": 1, "max_epochs": 100, "precision": 32},
- "criterion": {"temperature": 0.5},
- "optimizer": {"lr": 1, "weight_decay": 0.00001},
- "collate": {
- "input_size": 64,
- "cj_prob": 0.8,
- "cj_bright": 0.7,
- "cj_contrast": 0.7,
- "cj_sat": 0.7,
- "cj_hue": 0.2,
- "min_scale": 0.15,
- "random_gray_scale": 0.2,
- "gaussian_blur": 0.5,
- "kernel_size": 0.1,
- "vf_prob": 0,
- "hf_prob": 0.5,
- "rr_prob": 0,
- },
- },
-)
diff --git a/docs/source/docker_archive/integration/images/aws-s3-bucket-list.png b/docs/source/docker_archive/integration/images/aws-s3-bucket-list.png
deleted file mode 100644
index ab2012424..000000000
Binary files a/docs/source/docker_archive/integration/images/aws-s3-bucket-list.png and /dev/null differ
diff --git a/docs/source/docker_archive/integration/images/schedule-compute-run-config.png b/docs/source/docker_archive/integration/images/schedule-compute-run-config.png
deleted file mode 100644
index 4f8d83521..000000000
Binary files a/docs/source/docker_archive/integration/images/schedule-compute-run-config.png and /dev/null differ
diff --git a/docs/source/docker_archive/integration/images/schedule-compute-run.png b/docs/source/docker_archive/integration/images/schedule-compute-run.png
deleted file mode 100644
index e7a3cc75e..000000000
Binary files a/docs/source/docker_archive/integration/images/schedule-compute-run.png and /dev/null differ
diff --git a/docs/source/docker_archive/integration/images/webapp-embedding-after-2nd-docker.png b/docs/source/docker_archive/integration/images/webapp-embedding-after-2nd-docker.png
deleted file mode 100644
index 1a9d19e3d..000000000
Binary files a/docs/source/docker_archive/integration/images/webapp-embedding-after-2nd-docker.png and /dev/null differ
diff --git a/docs/source/docker_archive/integration/images/webapp-explore-after-docker.jpg b/docs/source/docker_archive/integration/images/webapp-explore-after-docker.jpg
deleted file mode 100644
index 19ad8848f..000000000
Binary files a/docs/source/docker_archive/integration/images/webapp-explore-after-docker.jpg and /dev/null differ
diff --git a/docs/source/docker_archive/integration/overview.rst b/docs/source/docker_archive/integration/overview.rst
deleted file mode 100644
index 3575463aa..000000000
--- a/docs/source/docker_archive/integration/overview.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-Integration
-===================================
-Here you learn how to integrate the Lightly Docker into data pre-processing pipelines.
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-
-.. toctree::
- :maxdepth: 1
-
- docker_with_datasource.rst
- docker_trigger_from_api.rst
- s3fs-fuse.rst
- dagster_aws.rst
diff --git a/docs/source/docker_archive/integration/s3fs-fuse.rst b/docs/source/docker_archive/integration/s3fs-fuse.rst
deleted file mode 100644
index b50133278..000000000
--- a/docs/source/docker_archive/integration/s3fs-fuse.rst
+++ /dev/null
@@ -1,218 +0,0 @@
-
-.. _ref-docker-integration-s3fs-fuse:
-
-Load data directly from S3 buckets using s3fs-fuse
-===================================================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Learn how to use Lightly Docker directly with an AWS S3 bucket as input.
-
-Very often ML teams don't have all the data locally on a single machine. Instead,
-the data is stored in cloud storage such as AWS S3 or Google Cloud Storage.
-Wouldn't it be great to connect directly to these storage buckets instead of
-having to download the data to your local machine every time?
-
-In this example we will do the following:
-
-1. Setup `s3fs-fuse `_ to mount an S3 bucket to your local machine storage
-2. Use the Lightly docker and set the input directory to the mounted S3 bucket
-
-What is s3fs-fuse?
---------------------
-
-`s3fs-fuse `_ is an open-source project
-that allows you to mount an S3 bucket as local file storage. Only a limited set of
-file operations is supported (e.g. appending data to a file is very inefficient and slow
-as there is no direct support). However, for just reading files to train an ML model
-this approach is more than sufficient.
-
-Here are some of the limitations pointed out in the GitHub readme:
-
-.. note:: Generally S3 cannot offer the same performance or semantics as a local file system. More specifically:
-
- - random writes or appends to files require rewriting the entire object, optimized with multi-part upload copy
- - metadata operations such as listing directories have poor performance due to network latency
- - non-AWS providers may have eventual consistency so reads can temporarily yield stale data (AWS offers read-after-write consistency since Dec 2020)
- - no atomic renames of files or directories
- - no coordination between multiple clients mounting the same bucket
- - no hard links
- - inotify detects only local modifications, not external ones by other clients or tools
-
-Get an AWS Bucket and Credentials
------------------------------------
-
-From the AWS dashboard go to the **S3** service (https://s3.console.aws.amazon.com/s3/home)
-and create an S3 bucket if you don't have one yet.
-
-If you don't have credentials yet you need to go to the **IAM** service
-(https://console.aws.amazon.com/iam/home) on AWS and create
-a new user. Make sure you add the **AmazonS3FullAccess** permission. Then create
-and download the credentials (.csv file). In the credentials file you should find
-the **Access key ID** and **Secret access key** we will use later.
-
-Install s3fs-fuse
--------------------
-
-In order to install `s3fs `_ we can follow the instructions from the GitHub readme.
-On `Debian 9 or newer` or `Ubuntu 16.04 or newer` we can use the following terminal instructions to install it:
-
-.. code-block:: console
-
- sudo apt install s3fs
-
-
-Below we show the output for installing s3fs on a Google Cloud Compute instance.
-
-.. code-block:: console
- :caption: Output of the install command
-
- $ sudo apt install s3fs
-
- Reading package lists... Done
- Building dependency tree
- Reading state information... Done
- The following NEW packages will be installed:
- s3fs
- 0 upgraded, 1 newly installed, 0 to remove and 81 not upgraded.
- Need to get 214 kB of archives.
- After this operation, 597 kB of additional disk space will be used.
- Get:1 http://deb.debian.org/debian buster/main amd64 s3fs amd64 1.84-1 [214 kB]
- Fetched 214 kB in 0s (8823 kB/s)
- perl: warning: Setting locale failed.
- perl: warning: Please check that your locale settings:
- LANGUAGE = (unset),
- LC_ALL = (unset),
- LC_CTYPE = "UTF-8",
- LANG = "C.UTF-8"
- are supported and installed on your system.
- perl: warning: Falling back to a fallback locale ("C.UTF-8").
- Selecting previously unselected package s3fs.
- (Reading database ... 109361 files and directories currently installed.)
- Preparing to unpack .../archives/s3fs_1.84-1_amd64.deb ...
- Unpacking s3fs (1.84-1) ...
- Setting up s3fs (1.84-1) ...
- Processing triggers for man-db (2.8.5-2) ...
-
-
-Configure S3 Credentials
---------------------------
-
-Our freshly installed `s3fs-fuse `_
-requires access to our S3 bucket. Luckily we can
-directly use an AWS credentials file. This file should look like this:
-
-.. code-block:: yaml
-    :caption: Example ~/.aws/credentials
-
- [default]
- aws_access_key_id=AKIAIOSFODNN7EXAMPLE
- aws_secret_access_key=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
-
-Let's mount a bucket. We need to create a local folder where we want the S3
-content to be mounted.
-
-.. code-block:: console
- :caption: Create a local folder **/s3-mount** using the terminal
-
- mkdir /s3-mount
-
-
-Now let's use s3fs to mount the bucket to our new folder. Run the following command
-in your terminal.
-
-.. code-block:: console
- :caption: Using the default aws credentials from **~/.aws/credentials**
-
- s3fs simple-test-bucket-igor /s3-mount
-
-.. note:: If you don't specify the location of the password file, s3fs uses the default
-    location of your AWS credentials: **~/.aws/credentials**.
-
-
-If everything went well you should now be able to see the content of your bucket
-in your **/s3-mount** folder. If you add a new file to the folder it will
-automatically be uploaded to the bucket.
-
-Optional: use a custom .passwd file for s3fs
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-If you don't want to use the default aws credentials you can also create a separate
-passwd file for s3fs:
-
-.. code-block:: console
-
- echo ACCESS_KEY_ID:SECRET_ACCESS_KEY > ${HOME}/.passwd-s3fs
- chmod 600 ${HOME}/.passwd-s3fs
-
-
-Now we can mount the S3 bucket using the following command in the terminal.
-
-.. code-block:: console
- :caption: Using the credentials in the **.passwd-s3fs** file
-
- s3fs simple-test-bucket-igor /s3-mount -o passwd_file=${HOME}/.passwd-s3fs
-
-Use S3 Storage with Lightly Docker
----------------------------------------
-
-Now we can use the docker run command and use the `/s3-mount` directory as the
-input dir.
-
-.. code-block:: console
-
- docker run --gpus all --rm -it \
- -v /s3-mount:/home/input_dir:ro \
- -v /docker/output:/home/output_dir \
- lightly/worker:latest \
- token=MYAWESOMETOKEN
-
-You can do the same for the docker output directory (in this example we used
-`/docker/output`). We can either use the same bucket and work on subfolders
-or use another bucket and repeat the procedure.
-
-Using a mounted S3 bucket for the docker output can be very handy.
-With this approach, the PDF report as well as all output files
-are directly uploaded to the S3 storage and can be shared with your team.
-
-Use Caching
---------------
-
-If we use the s3fs setup to train an ML model, we iterate multiple times over
-all the images in the bucket. That would not be very efficient, as streaming the
-data from the bucket adds a lot of latency overhead. The costs could also
-get high as we create many S3 transactions.
-
-You can specify a folder for the caching by adding `-o use_cache=/tmp` to the command:
-
-.. code-block:: console
- :caption: Using the default aws credentials from **~/.aws/credentials**
-
- s3fs simple-test-bucket-igor /s3-mount -o use_cache=/tmp
-
-
-For more information about caching check out the
-`FAQ wiki of s3fs `_.
-
-
-Common Issues
-----------------
-
-You need to make sure the AWS S3 region is set according to your bucket's location.
-In your AWS S3 dashboard you can find a list of S3 buckets as well as their regions.
-
-.. figure:: images/aws-s3-bucket-list.png
- :align: center
- :alt: Screenshot from aws dashboard showing the S3 buckets
- :figclass: align-center
-
-
-You can manually specify the AWS region by using the `url=...` flag as shown below:
-
-.. code-block:: console
-
- s3fs simple-test-bucket-igor /s3-mount -o passwd_file=${HOME}/.passwd-s3fs -o url="https://s3-eu-central-1.amazonaws.com"
diff --git a/docs/source/docker_archive/known_issues_faq.rst b/docs/source/docker_archive/known_issues_faq.rst
deleted file mode 100644
index 350fcfef3..000000000
--- a/docs/source/docker_archive/known_issues_faq.rst
+++ /dev/null
@@ -1,167 +0,0 @@
-.. _rst-docker-known-issues-faq:
-
-Known Issues and FAQ
-===================================
-
-.. warning::
- **The Docker Archive documentation is deprecated**
-
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-Docker is slow when working with long videos
----------------------------------------------------
-
-We are working on this issue internally. For now we suggest splitting the large
-videos into chunks. You can do this using ffmpeg without losing quality.
-The following command just breaks up the video in a way that no re-encoding is needed.
-
-.. code-block:: console
-
- ffmpeg -i input.mp4 -c copy -map 0 -segment_time 01:00:00 -f segment -reset_timestamps 1 output%03d.mp4
-
-What exactly happens here?
-
-- `input.mp4`, this is your input video
-- `-c copy -map 0`, this makes sure we just copy and don't re-encode the video
-- `-segment_time 01:00:00 -f segment`, defines that we want chunks of 1h each
-- `-reset_timestamps 1`, makes sure we reset the timestamps (each video starts from 0)
-- `output%03d.mp4`, name of the output videos (output001.mp4, output002.mp4, ...)
-
-Docker Crashes when running with GPUs
--------------------------------------
-
-Do you run the docker with `--gpus all` and encounter the following error?
-
-.. code-block:: console
-
- Error response from daemon: could not select device driver "" with capabilities: [[gpu]].
-
-This error might occur because your docker installation does not support GPUs.
-
-Try to install `nvidia-docker` following the guide
-`here `_.
-
-
-Shared Memory Error when running Lightly Worker
------------------------------------------------
-
-The following error message appears when the docker runtime does not have enough
-shared memory. By default, Docker uses 64 MB. However, when using multiple
-workers for data fetching (`lightly.loader.num_workers`) this might not be enough.
-
-.. code-block:: console
-
- ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
- Traceback (most recent call last):
- File "/opt/conda/envs/env/lib/python3.7/multiprocessing/queues.py", line 236, in _feed
- obj = _ForkingPickler.dumps(obj)
- File "/opt/conda/envs/env/lib/python3.7/multiprocessing/reduction.py", line 51, in dumps
- cls(buf, protocol).dump(obj)
- File "/opt/conda/envs/env/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 321, in reduce_storage
- fd, size = storage._share_fd_()
- RuntimeError: unable to write to file
-
-To solve this problem we need to increase the shared memory for the docker runtime.
-
-You can change the shared memory to 512 MBytes by adding `--shm-size="512m"` to
-the docker run command:
-
-.. code-block:: console
-
- # example of docker run with setting shared memory to 512 MBytes
- docker run --shm-size="512m" --gpus all
-
-
-Docker crashes because of too many open files
------------------------------------------------
-
-This error appears when the docker runtime does not have enough
-file handlers. By default, Docker uses 1024. However, when using multiple
-workers for data fetching (`lightly.loader.num_workers`) this might not be
-enough. As file handlers are used in many different parts of the code,
-the actual error message may differ, but it typically complains about too many open files.
-Internet connections, e.g. for connecting to the Lightly API, also use file handlers.
-
-
-To solve this problem we need to increase the number of file handlers for the
-docker runtime.
-
-You can change the number of file handlers to 90000 by adding
-`--ulimit nofile=90000:90000` to the docker run command:
-
-.. code-block:: console
-
- # example of docker run with 90000 file handlers
- docker run --ulimit nofile=90000:90000 --gpus all
-
-More documentation on docker file handlers is provided `here `_.
-
-
-Permission denied for input created with sudo
------------------------------------------------
-
-There are some problems if the input directory was created with root/sudo and
-the container tries to access it. This can be solved by making the files readable:
-
-.. code-block:: console
-
- # make subdirectories browsable
- find MY_INPUT_DIR -type d -exec chmod 755 {} +
-
- # make the files themselves readable
- find MY_INPUT_DIR -type f -exec chmod 644 {} +
-
-
-Error when using S3 fuse and mounting to docker
-------------------------------------------------
-
-If you use docker in combination with S3 fuse you might stumble across an issue
-where the docker container can't create the mount path for the input directory.
-
-.. code-block:: console
-
- docker: Error response from daemon: error while creating mount source path \
- '/home/ubuntu/mydataset/': mkdir /home/ubuntu/mydataset: file exists.
-
-You can resolve this problem by following the guide here:
-https://stackoverflow.com/a/61686833
-
-1. Uncomment the **user_allow_other** option in the **/etc/fuse.conf** file.
-2. When you mount the bucket using s3fs, use the **-o allow_other** option.
-
- .. code-block:: console
-
- s3fs my-s3-bucket /s3-mount -o allow_other -o use_cache=/tmp
-
-
-Token printed to shared stdout or logs
---------------------------------------
-
-The token (along with other Hydra configuration) will be printed to stdout, and so could appear in logs in an automated setup.
-
-.. code-block:: console
-
- docker run --rm -it \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/shared_dir \
- --ipc="host" --network="host" \
- lightly/worker:latest \
- token=MYAWESOMETOKEN \
- ...
-
-This can be avoided by setting your `token` via the `LIGHTLY_TOKEN` environment variable:
-
-.. code-block:: console
-
- docker run --rm -it \
- -e LIGHTLY_TOKEN=MYAWESOMETOKEN \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/shared_dir \
- --ipc="host" --network="host" \
- lightly/worker:latest \
- ...
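-
-The token still ends up in the shell history of the machine that starts the
-container. If that is a concern, one option is to keep it in a local file and
-pass it with docker's `--env-file` flag. A sketch; the file name `lightly.env`
-is arbitrary:
-
-.. code-block:: console
-
- # lightly.env contains a single line: LIGHTLY_TOKEN=MYAWESOMETOKEN
- docker run --rm -it \
- --env-file lightly.env \
- -v {INPUT_DIR}:/home/input_dir:ro \
- -v {OUTPUT_DIR}:/home/shared_dir \
- --ipc="host" --network="host" \
- lightly/worker:latest \
- ...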
diff --git a/docs/source/docker_archive/overview.rst b/docs/source/docker_archive/overview.rst
index 758d9a22d..64d8e2e12 100644
--- a/docs/source/docker_archive/overview.rst
+++ b/docs/source/docker_archive/overview.rst
@@ -4,83 +4,5 @@ Docker Archive
.. warning::
**The Docker Archive documentation is deprecated**
- The old workflow described in these docs will not be supported with new Lightly Worker versions above 2.6.
- Please switch to our `new documentation page `_ instead.
-
-We all know that when working with ML data we sometimes deal with really big datasets. The cloud solution is great for exploration and prototyping,
-and an easy way to work with Lightly. But there is more!
-
-.. figure:: images/lightly_docker_overview.png
- :align: center
- :alt: Alt text
- :figclass: align-center
-
-With the introduction of our on-premise solution, you can **process larger datasets completely on your end without data leaving your infrastructure**.
-We worked hard to make this happen and are very proud to present you with the following specs:
-
-* **NEW** :ref:`ref-docker-with-datasource`
-* **NEW** :ref:`integration-docker-trigger-from-api`
-
-* :ref:`ref-docker-active-learning` using Lightly Docker
-
-* Automatically upload the selected dataset to the Lightly Platform (see :ref:`ref-docker-upload-to-platform`)
-
-* See your docker runs live in the Lightly Platform (see :ref:`ref-docker-runs`)
-
-* Lightly Docker has built-in pretagging models (see :ref:`ref-docker-pretagging`)
-
- * Use this feature to pre-label your dataset or to only select images which contain certain objects
-
- * Supported object categories are: bicycle, bus, car, motorcycle, person, train, truck
-
-* Select from more than 1 Million samples within a few hours!
-
-* Runs directly with videos without prior extraction of the frames!
-
-* Wrapped in a docker container (no setup required if your system supports docker)
-
-* Configurable
-
- * Use stopping conditions for the selection strategy such as minimum distance between
- two samples
-
- * Use various selection strategies
-
- * Check for corrupt files and report them
-
- * Check for exact duplicates and report them
-
- * We expose the full Lightly\ **SSL** framework config
-
-* Automated reporting of the datasets for each run
-
- * PDF report with histograms, plots, statistics, and much more ...
-
-* Hand-optimized code (to instruction-level)
-
- * Multithreaded
-
- * SIMD instructions
-
-* Minimal hardware requirements:
-
- * 1 CPU core
-
- * 4 GB free RAM
-
-* Recommended hardware:
-
- * see :ref:`ref-hardware-recommendations`
-
-.. toctree::
- :maxdepth: 1
-
- getting_started/setup.rst
- getting_started/first_steps.rst
- advanced/overview.rst
- integration/overview.rst
- configuration/configuration.rst
- examples/overview.rst
- known_issues_faq.rst
- getting_started/hardware_recommendations.rst
-
+ The old workflow described in these docs will not be supported with new Lightly\ **One** Worker versions above 2.6.
+ Please switch to our `new documentation page `_ instead.
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 0c3aaded0..db1572018 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -14,11 +14,11 @@ Documentation
===================================
.. note:: These pages document the Lightly self-supervised learning library.
- If you are looking for the Lightly Worker Solution with
+ If you are looking for the Lightly\ **One** Worker Solution with
advanced `active learning algorithms `_ and
`selection strategies `_ to select the best samples
within millions of unlabeled images or video frames stored in your cloud storage or locally,
- please follow our `Lightly Worker documentation `_.
+ please follow our `Lightly\ **One** Worker documentation `_.
Lightly\ **SSL** is a computer vision framework for self-supervised learning.
@@ -37,8 +37,8 @@ Lightly AI
----------
- `Homepage `_
-- `Lightly Worker Solution Documentation `_
-- `Lightly Platform `_
+- `Lightly\ **One** Worker Solution Documentation `_
+- `Lightly\ **One** Platform `_
- `Github `_
- `Discord `_ (We have weekly paper sessions!)