Create a single data row
row_data can be a cloud storage URL (the dataset must be configured with the correct cloud storage IAM integration), a public URL to an asset, or a local file path.
import uuid
from labelbox import Client
from labelbox.schema.data_row_metadata import DataRowMetadataField

client = Client(api_key="<YOUR_API_KEY>")
dataset = client.create_dataset(name="testing-dataset")
data_row = dataset.create_data_row(row_data="https://picsum.photos/200/300")
# Global keys are optional but recommended.
# They are useful for maintaining references to a data row.
dataset.create_data_row(row_data="https://picsum.photos/200/300",
                        global_key=str(uuid.uuid4()))

# You can also upload metadata along with your data row
metadata_ontology = client.get_data_row_metadata_ontology()
dataset.create_data_row(
    row_data="https://picsum.photos/200/300",
    global_key=str(uuid.uuid4()),
    metadata_fields=[
        DataRowMetadataField(
            schema_id=metadata_ontology.reserved_by_name["tag"].uid,  # specify the schema id
            value="tag_string",  # typed input
        ),
    ],
)
# Add attachment
data_row.create_attachment(attachment_type="IMAGE_OVERLAY",
                           attachment_value="https://storage.googleapis.com/labelbox-sample-datasets/Docs/rgb.jpg",
                           attachment_name="RGB")
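As noted above, row_data can also be a local file path, in which case the SDK uploads the file to Labelbox storage for you. A minimal sketch (the file path below is a placeholder):

# Create a data row from a local file path (sketch; replace with a real path)
dataset.create_data_row(row_data="/path/to/local/image.jpg",
                        global_key=str(uuid.uuid4()))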
Create bulk data rows
The example script below imports a set of images along with:
- Global keys
- Metadata
- Attachments
- Image layers
Limit on uploading data rows in one SDK operation
To ensure performance, we recommend uploading no more than 150k data rows in a single call to the dataset.create_data_rows method. If you include metadata in the same call, the limit is 30k. If you have a larger dataset to upload, split your data rows into chunks and upload them in sequence (see the chunking sketch after the example below).
from labelbox import Client
from uuid import uuid4  # to generate unique global keys
import datetime

client = Client(api_key="<YOUR_API_KEY>")
metadata_ontology = client.get_data_row_metadata_ontology()
dataset = client.create_dataset(name="Bulk import example")
assets = [{"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid4())},
{"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid4())},
{"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid4())},
{"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid4())},
{"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid4())}]
asset_metadata_fields = [{"schema_id": metadata_ontology.reserved_by_name["captureDateTime"].uid, "value": datetime.datetime.utcnow()},
{"schema_id": metadata_ontology.reserved_by_name["tag"].uid, "value": "tag_string"},
{"schema_id": metadata_ontology.reserved_by_name["split"]["train"].parent, "value": metadata_ontology.reserved_by_name["split"]["train"].uid}]
asset_attachments = [{"type": "IMAGE_OVERLAY", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/rgb.jpg", "name": "RGB" },
{"type": "IMAGE_OVERLAY", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/cir.jpg", "name": "CIR"},
{"type": "IMAGE_OVERLAY", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/weeds.jpg", "name": "Weeds"},
{"type": "TEXT", "value": "IOWA, Zone 2232, June 2022 [Text string]"},
{"type": "TEXT", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/text_attachment.txt"},
{"type": "IMAGE", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/disease_attachment.jpeg"},
{"type": "VIDEO", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/drone_video.mp4"},
{"type": "HTML", "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/windy.html"}]
for item in assets:
    item["metadata_fields"] = asset_metadata_fields
    item["attachments"] = asset_attachments

task = dataset.create_data_rows(assets)
task.wait_till_done()
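If your upload exceeds the limits above, the chunking sketch below shows one way to split the work; the chunk size and the upload_in_chunks helper are illustrative, not part of the SDK:

# Hypothetical helper: upload a large list of assets in sequential chunks
CHUNK_SIZE = 150_000  # drop to 30_000 if the payload includes metadata

def upload_in_chunks(dataset, assets, chunk_size=CHUNK_SIZE):
    for start in range(0, len(assets), chunk_size):
        task = dataset.create_data_rows(assets[start:start + chunk_size])
        task.wait_till_done()  # wait for each chunk before starting the next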
Export data rows
# By default, the export does not include metadata
data_rows = dataset.export_data_rows()

# You can include metadata in the export
data_rows = dataset.export_data_rows(include_metadata=True)
In case of a timeout error
export_data_rows() has a timeout_seconds parameter that defaults to 30 seconds. If a timeout error occurs, increase the value. Example:
data_rows = dataset.export_data_rows(timeout_seconds=120)
Get data rows
You can query a data row by data row id or by global key.
# Get a data row by id
data_row = client.get_data_row(data_row.uid)

# You can use global keys (recommended) to look up data row ids
res = client.get_data_rows_for_global_keys([data_row.global_key])
data_row_ids = res['results']
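# (Sketch) You can then fetch the data row object from one of the returned ids
data_row = client.get_data_row(data_row_ids[0])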
# You can also use external ids to query data rows from your dataset,
# but this is much slower than using global keys and may match multiple data rows
data_row = dataset.data_row_for_external_id(data_row.external_id)
print(data_row)
Iterate through data rows in a dataset
data_rows_iterator = dataset.data_rows()
data_row = data_rows_iterator.get_one()
for data_row in dataset.data_rows():
    ...
Assign a data row's global key (must be unique)
# Assign a new, unique global key to an existing data row
new_global_key = str(uuid.uuid4())
res = client.assign_global_keys_to_data_rows(
[{
"data_row_id": data_row.uid,
"global_key": new_global_key
}]
)
print(res)
Update data rows
You can update a data row's row_data, global key, and external id.
data_row.update(row_data="new_row_data_url",
                global_key="new_global_key",
                external_id="new_external_id")
To update metadata, refer to the Metadata section.
Delete data rows
# Deleting a data row also removes it from its dataset
data_row.delete()
# Bulk delete a list of data rows (in this case, all the data rows we just uploaded)
from labelbox import DataRow
DataRow.bulk_delete(list(dataset.data_rows()))