Information on data row global keys
Global keys are user-specified unique IDs for your data rows that you can assign upon creation or afterward.
Requirements
Global key values must be non-blank strings; they:
- Can be up to 200 characters in length
- Support:
  - Lowercase English letters: a, b, c, ..., z
  - Uppercase English letters: A, B, C, ..., Z
  - Numerals: 0, 1, 2, ..., 9
  - Special characters: ! _ . * ' ( ) & $ @ = ; / : + , ? - and the space character
- Are case-insensitive
- Can include spaces within the value, but not at the beginning (leading) or the end (trailing)
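These rules can be checked client-side before you upload. The following is a minimal sketch of a hypothetical validator (it is not part of the Labelbox SDK) that encodes the length, character-set, and leading/trailing-space constraints listed above.
import re

# Hypothetical helper, not part of the Labelbox SDK.
# Allowed characters per the requirements above: letters, digits, spaces, and
# ! _ . * ' ( ) & $ @ = ; / : + , ? -
_GLOBAL_KEY_RE = re.compile(r"^[A-Za-z0-9!_.*'()&$@=;/:+,? -]{1,200}$")

def is_valid_global_key(key: str) -> bool:
    # Non-blank, no leading/trailing spaces, allowed characters only, at most 200 characters
    return bool(key) and key == key.strip() and bool(_GLOBAL_KEY_RE.fullmatch(key))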
Create data rows with global keys
Note: If you are using delegated access, it is common practice to use the data row URLs or object keys as the unique global keys that map your cloud-hosted assets to Labelbox data rows.
from labelbox import DataRow, Client
import uuid
client = Client("<YOUR_API_KEY>")
dataset = client.create_dataset(name="<DATASET_NAME>")
# Create a single data row in a dataset
data_row = dataset.create_data_row(row_data="https://picsum.photos/id/829/200/300", global_key=str(uuid.uuid4()))
# Create bulk data rows in a dataset with unique keys
data_rows = [
{"row_data": "https://picsum.photos/id/829/200/300", "global_key": str(uuid.uuid4())},
{"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid.uuid4())},
]
task = dataset.create_data_rows(data_rows)
task.wait_till_done()
print(task.errors)
# None
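For the delegated access pattern mentioned in the note above, a common approach is to reuse the asset's URL (or object key) as the global key itself. A minimal sketch, reusing the dataset created above:
# Sketch: use the cloud-hosted asset URL as its own global key
url = "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg"
task = dataset.create_data_rows([{"row_data": url, "global_key": url}])
task.wait_till_done()
print(task.errors)  # None, provided the URL is not already used as a global key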
The following code will fail to upload data rows due to duplicate global keys.
# This will have partial failure
same_key = "SAME_KEY"
# if you upload a list of data rows with duplicate keys in them, none will be uploaded.
for i in range(3):
    data_rows.append(
        {
            "row_data": "https://picsum.photos/id/829/200/300",
            "global_key": same_key
        }
    )
task_will_fail = dataset.create_data_rows(data_rows)
task_will_fail.wait_till_done()
print(task_will_fail.errors)
# ---
# WARNING:labelbox.schema.task:There are errors present. Please look at `task.errors` for more details
# {'message': 'Data rows contain empty string or duplicate global keys, which are not allowed'}
assets = [
    {"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid.uuid4())},
    {"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg", "global_key": str(uuid.uuid4())},
]
# This task will succeed
task_will_succeed = dataset.create_data_rows(assets)
task_will_succeed.wait_till_done()
print(task_will_succeed.errors)
# if you upload it again with the same global keys, it will fail
task_will_fail = dataset.create_data_rows(assets)
task_will_fail.wait_till_done()
print(task_will_fail.errors)
# ----
# ...... ER_DUP_ENTRY: Duplicate entry '<the_duplicate_key>' for key 'DataRow_globalKey_organizationId_unique_idx'"
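One way to recover from this is to first look up which of the keys already exist (using get_data_row_ids_for_global_keys, demonstrated in the query section below) and only upload the assets whose keys are still free. A minimal sketch:
# Sketch: upload only the assets whose global keys are not yet taken
lookup = client.get_data_row_ids_for_global_keys([a["global_key"] for a in assets])
free_keys = {e["global_key"] for e in lookup["errors"]}  # keys with no existing data row
new_assets = [a for a in assets if a["global_key"] in free_keys]
if new_assets:
    dataset.create_data_rows(new_assets).wait_till_done()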
Assign global keys to existing data rows
data_row = dataset.create_data_row(row_data="google.com")
global_key = str(uuid.uuid4())
res = client.assign_global_keys_to_data_rows(
[{
"data_row_id": data_row.uid,
"global_key": global_key
}]
)
print(res)
# {'status': 'SUCCESS', 'results': [{'data_row_id': 'cl80k85x6017y0z0dcvtb4f1s', 'global_key': '745b9940-2457-4f56-b999-a2d46650dd77', 'sanitized': False}], 'errors': []}
Similarly, the assignment of global keys will fail if there are duplicate global keys in the system.
Suppose you have already run the code cell above. In the following code, if you assign another data row the same global key as above, the assignment will fail.
It is possible to have "PARTIAL SUCCESS" results. If some data rows are assigned unique global keys and others are assigned existing global keys, the SDK returns a partial success payload: the results field contains the successful assignments, and the errors field contains the failed ones with error messages.
# Failure example
data_row_1 = dataset.create_data_row(row_data="yahoo.com")
res = client.assign_global_keys_to_data_rows(
[{
"data_row_id": data_row_1.uid,
"global_key": global_key #this will fail since we already used it in previous code
}]
)
print(res)
# ---
# WARNING:labelbox.client:There are errors present. Please look at 'errors' in the returned dict for more details
# {'status': 'FAILURE', 'results': [], 'errors': [{'data_row_id': 'cl847r6eo54ka070n8k5737tk', 'global_key': 'c4af584e-e3f6-4c3d-a5d9-e7244f00f48a', 'error': 'Invalid global key'}]}
# Partial Success example
data_row_2 = dataset.create_data_row(row_data="bing.com")
data_row_3 = dataset.create_data_row(row_data="duckduckgo.com")
res = client.assign_global_keys_to_data_rows(
[{
"data_row_id": data_row_2.uid,
"global_key": global_key, #this will fail since we already used it in previous code
},
{
"data_row_id": data_row_3.uid,
"global_key": str(uuid.uuid4()), # this one will succeed since it is a new global key
}
]
)
print(res) # res['results'] contains the successful ones, and the res['errors'] contains the failed ones with error messages
# ---
# WARNING:labelbox.client:There are errors present. Please look at 'errors' in the returned dict for more details
# {'status': 'PARTIAL SUCCESS', 'results': [{'data_row_id': 'cl84zf8ee03rv071b1i055nj5', 'global_key': 'ed8969a6-b6a4-445f-936e-ae2a9a6f0004', 'sanitized': False}], 'errors': [{'data_row_id': 'cl84zf80d292q07wz9fmubwbw', 'global_key': '963716ae-2da4-4c43-93ae-c8f414a980ae', 'error': 'Invalid global key'}]}
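To recover from a partial success, you can iterate over res['errors'] and re-assign the affected data rows with fresh keys. A minimal sketch:
# Sketch: retry the failed assignments with newly generated keys
retries = [
    {"data_row_id": err["data_row_id"], "global_key": str(uuid.uuid4())}
    for err in res["errors"]
]
if retries:
    print(client.assign_global_keys_to_data_rows(retries))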
Query data rows by global key
data_rows = []
global_keys = [str(uuid.uuid4()) for i in range(3)]
for i in range(3):
    data_rows.append(
        {
            "row_data": "aaaa.jpg",
            "global_key": global_keys[i]
        }
    )
task = dataset.create_data_rows(data_rows)
task.wait_till_done()
# this tests that the global keys were able to fetch all of the data rows we expect
res = client.get_data_row_ids_for_global_keys(global_keys)
print(res)
# ---
# {'status': 'SUCCESS', 'results': ['cl84zszs302z7074c8u7m2osx', 'cl84zszs302yz074ccg1acwu5', 'cl84zszs302z3074c6aahhq3q'], 'errors': []}
If you query for a non-existent global key, that particular lookup fails while the lookups for the valid keys still succeed.
# Partial Success example
global_keys.append("<a non-existent key>")
res = client.get_data_row_ids_for_global_keys(global_keys)
print(res)
# ---
# WARNING:labelbox.client:There are errors present. Please look at 'errors' in the returned dict for more details
# {'status': 'PARTIAL SUCCESS', 'results': ['cl84zszs302z7074c8u7m2osx', 'cl84zszs302yz074ccg1acwu5', 'cl84zszs302z3074c6aahhq3q'], 'errors': [{'global_key': '<a non-existent key>', 'error': 'Data Row not found'}]}
Now, you can add data row IDs to a batch for a project or a model run for other downstream tasks.
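For example, you can send the resolved data row IDs to a project batch. A minimal sketch, assuming an existing project; create_batch parameters may vary slightly between SDK versions:
# Sketch: create a project batch from the data row IDs resolved above
project = client.get_project("<PROJECT_ID>")
batch = project.create_batch(
    name="batch-from-global-keys",
    data_rows=res["results"],  # data row IDs returned by get_data_row_ids_for_global_keys
)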
Global keys are cleared after deleting data rows
Global keys are automatically deleted when their data rows are deleted. For example:
global_key = 'example_global_key'
new_data_row = {
"row_data": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg",
"external_id": str(uuid.uuid4()),
"global_key": global_key
}
# This task should succeed
task = dataset.create_data_rows([new_data_row])
task.wait_till_done()
print(task.errors) # None
# This task should fail
task = dataset.create_data_rows([new_data_row])
task.wait_till_done()
task.errors, task.result
# WARNING:labelbox.client:There are errors present. Please look at 'errors' in the returned dict for more details
# WARNING:labelbox.schema.task:There are errors present. Please look at `task.errors` for more details
---------------------------------------------------------------------------
#ValueError: Job failed. Errors : Duplicate global keys found: example_global_key
Since global keys are deleted along with their data rows, you can check whether a global key belonged to a deleted data row: the key will show up in the "errors" field with a "Data Row deleted" error.
# delete data row by global key
duplicated_data_row = client.get_data_row(client.get_data_row_ids_for_global_keys([global_key])['results'][0])
duplicated_data_row.delete()
print('deleted')
# Should not have results
result = client.get_data_row_ids_for_global_keys(['example_global_key'])
print(result)
# WARNING:labelbox.client:There are errors present. Please look at 'errors' in the returned dict for more details
#{'status': 'FAILURE', 'results': [''], 'errors': [{'global_key': 'example_global_key', 'error': 'Data Row deleted'}]}
# You can now upload a Data Row with same global key again
# This should succeed
task = dataset.create_data_rows([new_data_row])
task.wait_till_done()
print(task.errors)