How to import annotations on document (PDF) data and sample import formats.
Open this Colab for an interactive tutorial on importing annotations on PDF data.
Supported annotations
To import annotations in Labelbox, you need to create the annotations payload. In this section, we provide this payload for every supported annotation type.
Labelbox supports two formats for the annotations payload:
- Python annotation types (recommended)
- NDJSON
Both are described below.
Entity (Page specific)
`textSelections` is the payload required for each entity annotation. Each `textSelections` item in the list requires the following fields:
- The `groupId` associated with a group of words.
- A list of `tokenIds`, one for each word in the group of words.
- The `page` of the document (1-indexed).
Both `tokenIds` and `groupId` are extracted from the text layer URL attached to the data row. Please follow the end-to-end demo to learn how to construct an entity annotation for documents.
# Entity annotation (page-specific).
# Fix: use the snake_case field name `text_selections` (and `token_ids` /
# `group_id`), matching the documented Python annotation types and the
# other examples in this guide; the original mixed camelCase here.
entities_annotations = lb_types.ObjectAnnotation(
    name="named_entity",  # must match your ontology feature's name
    value=lb_types.DocumentEntity(
        name="named_entity",
        text_selections=[
            lb_types.DocumentTextSelection(
                token_ids=[],  # filled in later from the text layer
                group_id="",   # filled in later from the text layer
                page=1         # pages are 1-indexed
            )
        ]
    )
)
# NDJSON form of the entity annotation; tokenIds/groupId come from the
# document's text layer and are filled in later in this guide.
entities_annotations_ndjson = dict(
    name="named_entity",
    textSelections=[
        dict(
            tokenIds=["<UUID>"],  # ids associated with each word in a group
            groupId="<UUID>",     # id associated with a group of words
            page=1,
        )
    ],
)
Classification: Radio (Single-choice, Global)
# Radio (single-choice) classification, applied globally to the document.
_radio_answer = lb_types.ClassificationAnswer(name="first_radio_answer")
radio_annotation = lb_types.ClassificationAnnotation(
    name="radio_question",
    value=lb_types.Radio(answer=_radio_answer),
)
# NDJSON form of the global radio classification.
radio_annotation_ndjson = dict(
    name="radio_question",
    answer=dict(name="first_radio_answer"),
)
Classification: Checklist (Multi-choice, Global)
# Checklist (multi-choice) classification, applied globally to the document.
_checklist_answers = [
    lb_types.ClassificationAnswer(name="first_checklist_answer"),
    lb_types.ClassificationAnswer(name="second_checklist_answer"),
]
checklist_annotation = lb_types.ClassificationAnnotation(
    name="checklist_question",
    value=lb_types.Checklist(answer=_checklist_answers),
)
# NDJSON form of the global checklist classification.
checklist_annotation_ndjson = dict(
    name="checklist_question",
    answer=[
        dict(name="first_checklist_answer"),
        dict(name="second_checklist_answer"),
    ],
)
Bounding box (Page specific)
# Bounding box annotation (page-specific).
# Fix: pages are 1-indexed in this guide (see the entity section and every
# other example); the original used page=0 here, inconsistent with the rest.
bbox_annotation = lb_types.ObjectAnnotation(
    name="bounding_box",  # must match your ontology feature's name
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=102.771, y=135.3),   # x = left, y = top
        end=lb_types.Point(x=518.571, y=245.143),   # x = left + width, y = top + height
        page=1,  # pages are 1-indexed
        unit=lb_types.RectangleUnit.POINTS
    )
)
# NDJSON form of the bounding box annotation.
# Fix: pages are 1-indexed; the original used "page": 0, inconsistent with
# the documentation above and with every other example in this guide.
bbox_annotation_ndjson = {
    "name": "bounding_box",
    "bbox": {
        "top": 135.3,
        "left": 102.771,
        "height": 109.843,
        "width": 415.8
    },
    "page": 1,  # pages are 1-indexed
    "unit": "POINTS"
}
Classification: Checklist with nested classifications (Global)
# Checklist question whose first answer carries a nested checklist question.
_sub_checklist = lb_types.ClassificationAnnotation(
    name="sub_checklist_question",
    value=lb_types.Checklist(
        answer=[lb_types.ClassificationAnswer(name="first_sub_checklist_answer")]
    ),
)
nested_checklist_annotation = lb_types.ClassificationAnnotation(
    name="nested_checklist_question",
    value=lb_types.Checklist(
        answer=[
            lb_types.ClassificationAnswer(
                name="first_checklist_answer",
                classifications=[_sub_checklist],
            )
        ]
    ),
)
# NDJSON form of the nested checklist classification.
nested_checklist_annotation_ndjson = dict(
    name="nested_checklist_question",
    answer=[
        dict(
            name="first_checklist_answer",
            classifications=[
                dict(
                    name="sub_checklist_question",
                    answer=dict(name="first_sub_checklist_answer"),
                )
            ],
        )
    ],
)
Classification: Radio with nested classifications (Global)
# Radio question whose answer carries a nested radio question.
_sub_radio = lb_types.ClassificationAnnotation(
    name="sub_radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(name="first_sub_radio_answer")
    ),
)
nested_radio_annotation = lb_types.ClassificationAnnotation(
    name="nested_radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(
            name="first_radio_answer",
            classifications=[_sub_radio],
        )
    ),
)
# NDJSON form of the nested radio classification.
nested_radio_annotation_ndjson = dict(
    name="nested_radio_question",
    answer=dict(
        name="first_radio_answer",
        classifications=[
            dict(
                name="sub_radio_question",
                answer=dict(name="first_sub_radio_answer"),
            )
        ],
    ),
)
Classification: Free-form text (Global)
# Free-form text classification, applied globally to the document.
text_annotation = lb_types.ClassificationAnnotation(
    name="free_text",  # must match your ontology feature's name
    value=lb_types.Text(answer="sample text"),
)
# NDJSON form of the global free-form text classification.
text_annotation_ndjson = dict(name="free_text", answer="sample text")
Bounding box with nested classifications (Page specific)
# Second-level radio sub-question nested under the first sub-answer.
_second_sub_radio = lb_types.ClassificationAnnotation(
    name="second_sub_radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(name="second_sub_radio_answer")
    ),
)
# First-level radio sub-question attached to the bounding box.
_bbox_sub_radio = lb_types.ClassificationAnnotation(
    name="sub_radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(
            name="first_sub_radio_answer",
            classifications=[_second_sub_radio],
        )
    ),
)
# Bounding box on page 1 carrying the nested radio classifications.
bbox_with_radio_subclass_annotation = lb_types.ObjectAnnotation(
    name="bbox_with_radio_subclass",
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=317.271, y=226.757),  # x = left, y = top
        end=lb_types.Point(x=566.657, y=420.986),    # x = left + width, y = top + height
        unit=lb_types.RectangleUnit.POINTS,
        page=1,
    ),
    classifications=[_bbox_sub_radio],
)
# NDJSON form of the bounding box with nested radio classifications.
bbox_with_radio_subclass_annotation_ndjson = dict(
    name="bbox_with_radio_subclass",
    classifications=[
        dict(
            name="sub_radio_question",
            answer=dict(
                name="first_sub_radio_answer",
                classifications=[
                    dict(
                        name="second_sub_radio_question",
                        answer=dict(name="second_sub_radio_answer"),
                    )
                ],
            ),
        )
    ],
    bbox=dict(top=226.757, left=317.271, height=194.229, width=249.386),
    page=1,
    unit="POINTS",
)
Entity with nested classifications (Page specific)
# Checklist sub-question attached to the entity annotation.
_ner_sub_checklist = lb_types.ClassificationAnnotation(
    name="sub_checklist_question",
    value=lb_types.Checklist(
        answer=[lb_types.ClassificationAnswer(name="first_sub_checklist_answer")]
    ),
)
# Entity annotation (page-specific) carrying the checklist sub-question;
# the empty text selection is populated later from the text layer.
ner_with_checklist_subclass_annotation = lb_types.ObjectAnnotation(
    name="ner_with_checklist_subclass",
    value=lb_types.DocumentEntity(
        name="ner_with_checklist_subclass",
        text_selections=[
            lb_types.DocumentTextSelection(token_ids=[], group_id="", page=1)
        ],
    ),
    classifications=[_ner_sub_checklist],
)
# NDJSON form of the entity annotation with a checklist sub-classification.
ner_with_checklist_subclass_annotation_ndjson = dict(
    name="ner_with_checklist_subclass",
    classifications=[
        dict(
            name="sub_checklist_question",
            answer=[dict(name="first_sub_checklist_answer")],
        )
    ],
    textSelections=[dict(tokenIds=[""], groupId="", page=1)],
)
Relationships with Entity (Page specific)
Relationship annotations are only supported for MAL import jobs.
# Source and target entity annotations for an entity-to-entity relationship.
# Fix: use the snake_case field name `text_selections` consistently, matching
# the documented Python annotation types and the other examples in this guide
# (the original used camelCase `textSelections=` here).
entity_source = lb_types.ObjectAnnotation(
    name="named_entity",
    value=lb_types.DocumentEntity(
        name="named_entity",
        text_selections=[
            lb_types.DocumentTextSelection(
                token_ids=[],  # filled in later from the text layer
                group_id="",   # filled in later from the text layer
                page=1
            )
        ]
    )
)
entity_target = lb_types.ObjectAnnotation(
    name="named_entity",
    value=lb_types.DocumentEntity(
        name="named_entity",
        text_selections=[
            lb_types.DocumentTextSelection(
                token_ids=[],
                group_id="",
                page=1
            )
        ]
    )
)
# Unidirectional relationship pointing from the source entity to the target.
entity_relationship = lb_types.RelationshipAnnotation(
    name="relationship",
    value=lb_types.Relationship(
        source=entity_source,
        target=entity_target,
        type=lb_types.Relationship.Type.UNIDIRECTIONAL,
    ),
)
# Fresh UUIDs so the relationship payload below can reference its
# source and target entity annotations.
uuid_source = str(uuid.uuid4())
uuid_target = str(uuid.uuid4())

entity_source_ndjson = dict(
    name="named_entity",
    uuid=uuid_source,
    textSelections=[dict(tokenIds=[""], groupId="", page=1)],
)
entity_target_ndjson = dict(
    name="named_entity",
    uuid=uuid_target,
    textSelections=[dict(tokenIds=[""], groupId="", page=1)],
)

# Relationship referencing the two entity payloads above by UUID.
ner_relationship_annotation_ndjson = dict(
    name="relationship",
    relationship=dict(
        source=uuid_source,  # UUID reference to source annotation
        target=uuid_target,  # UUID reference to target annotation
        type="unidirectional",
    ),
)
Relationships with bounding box (Page specific)
# Source and target boxes for a box-to-box relationship (both on page 1).
bbox_source = lb_types.ObjectAnnotation(
    name="bounding_box",
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=188.257, y=68.875),  # top-left: x = left, y = top
        end=lb_types.Point(x=270.907, y=149.556),   # bottom-right: x = left + width, y = top + height
        unit=lb_types.RectangleUnit.POINTS,
        page=1,
    ),
)
bbox_target = lb_types.ObjectAnnotation(
    name="bounding_box",
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=96.424, y=66.251),
        end=lb_types.Point(x=179.074, y=146.932),
        unit=lb_types.RectangleUnit.POINTS,
        page=1,
    ),
)
# Unidirectional relationship pointing from the source box to the target box.
bbox_relationship = lb_types.RelationshipAnnotation(
    name="relationship",
    value=lb_types.Relationship(
        source=bbox_source,
        target=bbox_target,
        type=lb_types.Relationship.Type.UNIDIRECTIONAL,
    ),
)
## Only supported for MAL imports
# Fresh UUIDs so the relationship payload can reference its source/target boxes.
uuid_source_2 = str(uuid.uuid4())
uuid_target_2 = str(uuid.uuid4())

bbox_source_ndjson = dict(
    name="bounding_box",
    uuid=uuid_source_2,
    bbox=dict(top=68.875, left=188.257, height=80.681, width=82.65),
    page=1,
    unit="POINTS",
)
bbox_target_ndjson = dict(
    name="bounding_box",
    uuid=uuid_target_2,
    bbox=dict(top=66.251, left=96.424, height=80.681, width=82.65),
    page=1,
    unit="POINTS",
)

bbox_relationship_annotation_ndjson = dict(
    name="relationship",
    relationship=dict(
        source=uuid_source_2,  # UUID reference to source bbox annotation
        target=uuid_target_2,  # UUID reference to target bbox annotation
        type="unidirectional",
    ),
)
End-to-end example: Import pre-labels or ground truth
Whether you are importing annotations as pre-labels or as ground truth, the steps are very similar. Steps 5 and 6 (creating and importing the annotation payload) are where the process becomes slightly different and is explained below in detail.
Before you start
You must import these libraries to use the code examples in this section.
import uuid
import labelbox as lb
import labelbox.types as lb_types
from labelbox.schema.queue_mode import QueueMode
Replace with your API key
API_KEY = ""  # replace with your Labelbox API key before running
client = lb.Client(api_key=API_KEY)
Step 1: Import data rows
To attach annotations to a data row, it must first be uploaded to Catalog. Here we create an example document (PDF) data row in Catalog. If you would like to use entity annotations, you need to add a text layer URL. Removing the text layer URL of a labeled asset will make any entity annotations hidden in the UI.
Generate a text layer for your documents
To learn how to generate a text layer for your documents, please refer to the following repositories/files:
## Text layer url is required for uploading entity annotations
global_key = "0801.3483.pdf"

# One PDF data row: row_data carries both the PDF and its text layer.
pdf_data_row = {
    "row_data": {
        "pdf_url": "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483.pdf",
        "text_layer_url": "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json"
    },
    "global_key": global_key
}

dataset = client.create_dataset(name="pdf_demo_dataset")
task = dataset.create_data_rows([pdf_data_row])
task.wait_till_done()
print("Errors:", task.errors)
print("Failed data rows:", task.failed_data_rows)
Step 2: Create an ontology
Your project should have the correct ontology set up with all the tools and classifications supported for your annotations. The value for the name parameter should match the name field in your annotations to ensure the correct feature schemas are matched.
Here is an example of creating an ontology programmatically for all the sample annotations above.
## Set up the ontology and link the tools created above.
# Every `name` below must match the `name` field of the corresponding
# annotation payload above so the feature schemas line up on import.
ontology_builder = lb.OntologyBuilder(
    classifications=[ # List of Classification objects
        # Global single-choice (radio) question.
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="radio_question",
            scope = lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(value="first_radio_answer"),
                lb.Option(value="second_radio_answer")
            ]
        ),
        # Global multi-choice (checklist) question.
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="checklist_question",
            scope = lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(value="first_checklist_answer"),
                lb.Option(value="second_checklist_answer")
            ]
        ),
        # Global free-form text question.
        lb.Classification(
            class_type=lb.Classification.Type.TEXT,
            name="free_text",
            scope = lb.Classification.Scope.GLOBAL
        ),
        # Global radio question with a nested radio sub-question.
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="nested_radio_question",
            scope = lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option("first_radio_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.RADIO,
                            name="sub_radio_question",
                            options=[lb.Option("first_sub_radio_answer")]
                        )
                    ])
            ]
        ),
        # Global checklist question with a nested checklist sub-question.
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="nested_checklist_question",
            scope = lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option("first_checklist_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.CHECKLIST,
                            name="sub_checklist_question",
                            options=[lb.Option("first_sub_checklist_answer")]
                        )
                    ])
            ]
        ),
    ],
    tools=[ # List of Tool objects
        # Page-specific tools used by the examples above.
        lb.Tool( tool=lb.Tool.Type.BBOX,name="bounding_box"),
        lb.Tool(tool=lb.Tool.Type.NER, name="named_entity"),
        lb.Tool(tool=lb.Tool.Type.RELATIONSHIP,name="relationship"),
        # Entity tool carrying a checklist sub-classification.
        lb.Tool(tool=lb.Tool.Type.NER,
            name="ner_with_checklist_subclass",
            classifications=[
                lb.Classification(
                    class_type=lb.Classification.Type.CHECKLIST,
                    name="sub_checklist_question",
                    options=[
                        lb.Option(value="first_sub_checklist_answer")
                    ]
                )
            ]),
        # Bounding box tool with two levels of nested radio sub-classifications.
        lb.Tool( tool=lb.Tool.Type.BBOX,
            name="bbox_with_radio_subclass",
            classifications=[
                lb.Classification(
                    class_type=lb.Classification.Type.RADIO,
                    name="sub_radio_question",
                    options=[
                        lb.Option(
                            value="first_sub_radio_answer" ,
                            options=[
                                lb.Classification(
                                    class_type=lb.Classification.Type.RADIO,
                                    name="second_sub_radio_question",
                                    options=[lb.Option("second_sub_radio_answer")]
                                )]
                        )]
                )]
        )]
)
# Create the ontology in Labelbox, scoped to Document (PDF) media.
ontology = client.create_ontology("Document Annotation Import Demo",
                                  ontology_builder.asdict(),
                                  media_type=lb.MediaType.Document)
Step 3: Create a labeling project
Create a project and connect the ontology created above
# Create a Labelbox project for document (PDF) assets and attach the
# ontology built above.
project = client.create_project(
    name="PDF_annotation_demo",
    queue_mode=QueueMode.Batch,
    media_type=lb.MediaType.Document,
)
project.setup_editor(ontology)
Step 4: Send a batch of data rows to the project
# Queue the data row to the project as a batch.
project.create_batch(
    "PDF_annotation_batch",    # each batch in a project must have a unique name
    global_keys=[global_key],  # a list of global keys, data rows, or data row ids
    priority=5,                # priority between 1 (highest) and 5 (lowest)
)
Step 5: Create the annotation payload
First, we need to populate the text selections for the Entity annotations.
import requests
import json
# Helper method
def update_text_selections(annotation, group_id, list_tokens, page):
    """Set the ``textSelections`` entry of an NDJSON annotation dict in place.

    Args:
        annotation: NDJSON annotation payload dict (mutated in place).
        group_id: id representing the group of words from the text layer.
        list_tokens: ids of the individual words in the group.
        page: 1-indexed page number the selection belongs to.
    """
    # Fix: the original returned ``annotation.update(...)``, which is always
    # None because dict.update mutates in place; assign the key directly so
    # the intent (in-place mutation) is explicit.
    annotation["textSelections"] = [
        {
            "groupId": group_id,
            "tokenIds": list_tokens,
            "page": page,
        }
    ]
# URL of the text layer attached to the data row created in step 1.
text_layer = ("https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/"
              "data/99-word-token-pdfs/0801.3483-lb-textlayer.json")
# Download the text layer content
res = requests.get(text_layer)
# Phrases that we want to annotate, taken from the text layer url
content_phrases = [
    "Metal-insulator (MI) transitions have been one of the",
    "T. Sasaki,* N. Yoneyama, and N. Kobayashi",
    "Organic charge transfer salts based on the donor",
    "the experimental investigations on this issue have not",
]
# Accumulators filled while parsing the text layer below.
text_selections = []
text_selections_ner = []
text_selections_source = []
text_selections_target = []
# Parse the text layer: for each target phrase, build text selections for
# both the Python annotation types and the NDJSON payloads.
# Fix: DocumentTextSelection is constructed with its snake_case field names
# (group_id, token_ids) consistently; the original mixed camelCase and
# snake_case keyword arguments (even within a single call).
for obj in json.loads(res.text):
    for group in obj["groups"]:
        if group["content"] == content_phrases[0]:
            list_tokens = [x["id"] for x in group["tokens"]]
            # build text selections for Python Annotation Types
            document_text_selection = lb_types.DocumentTextSelection(group_id=group["id"], token_ids=list_tokens, page=1)
            text_selections.append(document_text_selection)
            # build text selection for the NDJson annotations
            update_text_selections(annotation=entities_annotations_ndjson,
                group_id=group["id"],  # id representing group of words
                list_tokens=list_tokens,  # ids representing individual words from the group
                page=1)
        if group["content"] == content_phrases[1]:
            list_tokens_2 = [x["id"] for x in group["tokens"]]
            # build text selections for Python Annotation Types
            ner_text_selection = lb_types.DocumentTextSelection(group_id=group["id"], token_ids=list_tokens_2, page=1)
            text_selections_ner.append(ner_text_selection)
            # build text selection for the NDJson annotations
            update_text_selections(annotation=ner_with_checklist_subclass_annotation_ndjson,
                group_id=group["id"],  # id representing group of words
                list_tokens=list_tokens_2,  # ids representing individual words from the group
                page=1)
        if group["content"] == content_phrases[2]:
            relationship_source = [x["id"] for x in group["tokens"]]
            # build text selections for Python Annotation Types
            text_selection_entity_source = lb_types.DocumentTextSelection(group_id=group["id"], token_ids=relationship_source, page=1)
            text_selections_source.append(text_selection_entity_source)
            # build text selection for the NDJson annotations
            update_text_selections(annotation=entity_source_ndjson,
                group_id=group["id"],  # id representing group of words
                list_tokens=relationship_source,  # ids representing individual words from the group
                page=1)
        if group["content"] == content_phrases[3]:
            relationship_target = [x["id"] for x in group["tokens"]]
            # build text selections for Python Annotation Types
            text_selection_entity_target = lb_types.DocumentTextSelection(group_id=group["id"], token_ids=relationship_target, page=1)
            text_selections_target.append(text_selection_entity_target)
            # build text selections for the NDJson annotations
            update_text_selections(annotation=entity_target_ndjson,
                group_id=group["id"],  # id representing group of words
                list_tokens=relationship_target,  # ids representing individual words from the group
                page=1)
Re-write the Python annotations to include text selections (only required for Python annotation types)
# re-write the entity annotation with text selections
# Fix: use the snake_case field name `text_selections` consistently; the
# original mixed `textSelections=` and `text_selections=` across these calls.
entities_annotation_document_entity = lb_types.DocumentEntity(name="named_entity", text_selections=text_selections)
entities_annotation = lb_types.ObjectAnnotation(name="named_entity", value=entities_annotation_document_entity)
# re-write the entity annotation + subclassification with text selections
classifications = [
    lb_types.ClassificationAnnotation(
        name="sub_checklist_question",
        value=lb_types.Checklist(
            answer=[lb_types.ClassificationAnswer(name="first_sub_checklist_answer")]
        )
    )
]
ner_annotation_with_subclass = lb_types.DocumentEntity(name="ner_with_checklist_subclass", text_selections=text_selections_ner)
ner_with_checklist_subclass_annotation = lb_types.ObjectAnnotation(name="ner_with_checklist_subclass",
                                                                   value=ner_annotation_with_subclass,
                                                                   classifications=classifications)
# re-write the entity source and target annotations with the text selections
entity_source_doc = lb_types.DocumentEntity(name="named_entity", text_selections=text_selections_source)
entity_source = lb_types.ObjectAnnotation(name="named_entity", value=entity_source_doc)
entity_target_doc = lb_types.DocumentEntity(name="named_entity", text_selections=text_selections_target)
entity_target = lb_types.ObjectAnnotation(name="named_entity", value=entity_target_doc)
# re-write the entity relationship with the re-created entities
entity_relationship = lb_types.RelationshipAnnotation(
    name="relationship",
    value=lb_types.Relationship(
        source=entity_source,
        target=entity_target,
        type=lb_types.Relationship.Type.UNIDIRECTIONAL,
    ))
# Inspect the assembled payloads before import.
for payload_name, payload in [
    ("entities_annotations_ndjson", entities_annotations_ndjson),
    ("entities_annotation", entities_annotation),
    ("nested_entities_annotation_ndjson", ner_with_checklist_subclass_annotation_ndjson),
    ("nested_entities_annotation", ner_with_checklist_subclass_annotation),
    ("entity_source_ndjson", entity_source_ndjson),
    ("entity_target_ndjson", entity_target_ndjson),
    ("entity_source", entity_source),
    ("entity_target", entity_target),
]:
    print(f"{payload_name}={payload}")
Create the annotations payload using the snippets of code shown above.
Labelbox supports two formats for the annotations payload: NDJSON and Python annotation types. Both approaches are described below with instructions to compose annotations into Labels attached to the data rows.
The resulting label_ndjson
and labels
from each approach will include every annotation (created above) supported by the respective method.
# create a Label bundling every Python-annotation-type example above
pdf_annotations = [
    entities_annotation,
    checklist_annotation,
    nested_checklist_annotation,
    text_annotation,
    radio_annotation,
    nested_radio_annotation,
    bbox_annotation,
    bbox_with_radio_subclass_annotation,
    ner_with_checklist_subclass_annotation,
    entity_source,
    entity_target,
    entity_relationship,  # Only supported for MAL imports
    bbox_source,
    bbox_target,
    bbox_relationship,  # Only supported for MAL imports
]
labels = [
    lb_types.Label(
        data=lb_types.DocumentData(global_key=global_key),
        annotations=pdf_annotations,
    )
]
# Attach the data row reference to every NDJSON payload and collect them.
label_ndjson = []
ndjson_payloads = (
    entities_annotations_ndjson,
    checklist_annotation_ndjson,
    nested_checklist_annotation_ndjson,
    text_annotation_ndjson,
    radio_annotation_ndjson,
    nested_radio_annotation_ndjson,
    bbox_annotation_ndjson,
    bbox_with_radio_subclass_annotation_ndjson,
    ner_with_checklist_subclass_annotation_ndjson,
    entity_source_ndjson,
    entity_target_ndjson,
    ner_relationship_annotation_ndjson,  # Only supported for MAL imports
    bbox_source_ndjson,
    bbox_target_ndjson,
    bbox_relationship_annotation_ndjson,  # Only supported for MAL imports
)
for payload in ndjson_payloads:
    payload["dataRow"] = {"globalKey": global_key}
    label_ndjson.append(payload)
Step 6: Import the annotation payload
For both options, you can pass either the label_ndjson or the labels payload as the value for the predictions or labels parameter.
Option A: Upload to a labeling project as pre-labels (Model-assisted labeling)
# Submit the labels as MAL pre-labels (predictions).
upload_job = lb.MALPredictionImport.create_from_objects(
    client=client,
    project_id=project.uid,
    name=f"pdf_annotation_upload{uuid.uuid4()}",
    predictions=labels,
)
# Block until the import finishes.
upload_job.wait_until_done()
# Errors will appear for annotation uploads that failed.
print("Errors:", upload_job.errors)
print("Status of uploads: ", upload_job.statuses)
Option B: Upload to a labeling project as ground truth
Relationship annotations are not supported in label import jobs
# Submit the labels as ground truth. Note: relationship annotations are not
# supported by LabelImport (ground-truth) jobs.
upload_job = lb.LabelImport.create_from_objects(
    client = client,
    project_id = project.uid,
    name="label_import_job"+str(uuid.uuid4()),
    labels=labels)
# Fix: block until the import finishes before reading errors/statuses;
# without this the job may still be running (mirrors the MAL example above).
upload_job.wait_until_done()
print("Errors:", upload_job.errors)
print("Status of uploads: ", upload_job.statuses)