Create a single data row

row_data can be a cloud storage URL (the dataset must be configured with the correct cloud storage IAM integration), a public URL to an asset, or a local file path.

import uuid

from labelbox import Client
from labelbox.schema.data_row_metadata import DataRowMetadataField

client = Client(api_key="<YOUR_API_KEY>")

dataset = client.create_dataset(name="testing-dataset")
dataset.create_data_row(row_data="https://picsum.photos/200/300")

# Global keys are optional but recommended.
# They are useful for maintaining references to a data row.
dataset.create_data_row(row_data="https://picsum.photos/200/300",
                        global_key=str(uuid.uuid4()))

# You can also upload metadata along with your data row
metadata_ontology = client.get_data_row_metadata_ontology()
data_row = dataset.create_data_row(row_data="https://picsum.photos/200/300",
                                   global_key=str(uuid.uuid4()),
                                   metadata_fields=[
                                       DataRowMetadataField(
                                           schema_id=metadata_ontology.reserved_by_name["tag"].uid,  # specify the schema id
                                           value="tag_string",  # typed input
                                       ),
                                   ],
)

# Add an attachment to the data row
data_row.create_attachment(attachment_type="IMAGE_OVERLAY",
                           attachment_value="https://storage.googleapis.com/labelbox-sample-datasets/Docs/rgb.jpg",
                           attachment_name="RGB")
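
As noted above, row_data can also point at a local file, in which case the SDK uploads the file to Labelbox-managed storage for you. A minimal sketch, reusing the dataset created above (the path below is a hypothetical placeholder):

# row_data can also be a local file path; the SDK uploads the file
# to Labelbox storage (the path here is a placeholder)
dataset.create_data_row(row_data="/path/to/local/image.jpg")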

Create bulk data rows

The example script below imports a set of images along with:

  • Global keys
  • Metadata
  • Attachments
  • Image layers

📘

Limit on uploading data rows in one SDK operation

To ensure performance, we recommend uploading no more than 150k data rows at a time with the dataset.create_data_rows method. If you include metadata in the same call, the limit is 30k. If you have a larger dataset to upload, split your data rows into chunks and upload them in sequence, as sketched below.
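
A minimal chunking sketch, assuming assets is a list of data row dictionaries like the one built in the script below:

CHUNK_SIZE = 150_000  # drop to 30_000 when metadata is included in the call

for start in range(0, len(assets), CHUNK_SIZE):
    task = dataset.create_data_rows(assets[start:start + CHUNK_SIZE])
    task.wait_till_done()  # upload chunks sequentially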

from uuid import uuid4  # to generate unique global keys
import datetime

from labelbox import Client

client = Client(api_key="<YOUR_API_KEY>")
metadata_ontology = client.get_data_row_metadata_ontology()

dataset = client.create_dataset(name="Bulk import example")

assets = [{"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid4())},
          {"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid4())},
          {"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid4())},
          {"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid4())},
          {"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid4())}]


asset_metadata_fields = [{"schema_id": metadata_ontology.reserved_by_name["captureDateTime"].uid, "value": datetime.datetime.utcnow()},
                         {"schema_id": metadata_ontology.reserved_by_name["tag"].uid, "value": "tag_string"},
                         {"schema_id": metadata_ontology.reserved_by_name["split"]["train"].parent, "value": metadata_ontology.reserved_by_name["split"]["train"].uid}]

asset_attachments = [{"type": "IMAGE_OVERLAY", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/rgb.jpg", "name": "RGB" },
                     {"type": "IMAGE_OVERLAY", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/cir.jpg", "name": "CIR"},
                     {"type": "IMAGE_OVERLAY", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/weeds.jpg", "name": "Weeds"},
                     {"type": "TEXT", "value": "IOWA, Zone 2232, June 2022 [Text string]"},
                     {"type": "TEXT", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/text_attachment.txt"},
                     {"type": "IMAGE", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/disease_attachment.jpeg"},
                     {"type": "VIDEO", "value":  "https://storage.googleapis.com/labelbox-sample-datasets/Docs/drone_video.mp4"},
                     {"type": "HTML", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/windy.html"}]

for item in assets:
  item["metadata_fields"] = asset_metadata_fields
  item["attachments"] = asset_attachments

task = dataset.create_data_rows(assets)
task.wait_till_done()
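
Once the task finishes, it is worth checking the outcome; a minimal sketch, assuming the status and errors attributes exposed on the task object returned by the SDK:

# Inspect the finished task (attribute names assumed from the SDK's Task object)
print(task.status)  # e.g. "COMPLETE"
print(task.errors)  # None if every row imported successfully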

Export data rows

# By default, the export does not include metadata
data_rows = dataset.export_data_rows()

# Pass include_metadata=True to include metadata in the export
data_rows = dataset.export_data_rows(include_metadata=True)
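
export_data_rows returns the data rows as an iterable; a minimal sketch of consuming it (the attribute names match those used elsewhere in this section):

# Iterate over the exported data rows
for data_row in dataset.export_data_rows(include_metadata=True):
    print(data_row.uid, data_row.global_key)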

Get data rows

You can query a data row by data row id or by global key.

# Get a data row by id
data_row = client.get_data_row("<DATA_ROW_ID>")

# You can use global keys (recommended) to look up data row ids
res = client.get_data_rows_for_global_keys([data_row.global_key])
data_row_ids = res['results']

# You can also look up a data row by external id within a dataset,
# but this is much slower than using a global key, and external ids
# are not guaranteed to be unique
data_row = dataset.data_row_for_external_id(data_row.external_id)
print(data_row)
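
If you want the data row object itself rather than its id, recent SDK versions also expose a direct lookup by global key; a minimal sketch, assuming an SDK version that includes this helper:

# Assumes an SDK version that exposes get_data_row_by_global_key
data_row = client.get_data_row_by_global_key(data_row.global_key)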

Iterate through data rows in a dataset

data_rows_iterator = dataset.data_rows()
data_row = next(data_rows_iterator)

for data_row in dataset.data_rows():
  print(data_row.uid)  # process each data row
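
data_rows() returns a paginated collection, so iterating lazily avoids loading everything at once; materializing it as a list fetches every page, which can be slow on large datasets:

# Fetches all pages up front; fine for small datasets
all_rows = list(dataset.data_rows())
print(len(all_rows))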

Assign a data row's global key (must be unique)

# Assign a new (unique) global key to an existing data row
new_global_key = str(uuid.uuid4())
res = client.assign_global_keys_to_data_rows(
    [{
        "data_row_id": data_row.uid,
        "global_key": new_global_key
    }]
)
print(res)
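
The response reports per-row outcomes; a minimal sketch of checking it, assuming the same 'results'/'errors' shape seen in get_data_rows_for_global_keys above:

# 'errors' lists any assignments that failed (e.g. duplicate global keys)
if res["errors"]:
    print("Some assignments failed:", res["errors"])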

Delete data rows

# Deleting a data row also removes it from its dataset
data_row.delete()

# Bulk delete a list of data rows (here, all the rows we just uploaded)
from labelbox import DataRow

DataRow.bulk_delete(list(dataset.data_rows()))