Import document annotations

How to import annotations on document (PDF) data and sample import formats.

Open this Colab for an interactive tutorial on importing annotations on PDF data.

Supported annotations

To import annotations in Labelbox, you need to create the annotations payload. In this section, we provide this payload for every supported annotation type.

Labelbox supports two formats for the annotations payload:

  • Python annotation types (recommended)
  • NDJSON

Both are described below.

Entity (Page specific)

textSelections is the payload required for each entity annotation. Each textSelections item in the list requires the following fields:

  • The groupId associated with a group of words.
  • A list of tokenIds for each word in the group of words.
  • The page of the document (1-indexed).

Both tokenIds and groupId are extracted from the text layer URL attached to the data row. Please follow the end-to-end demo to learn how to construct an entity annotation for documents.

# Entity annotation (Python annotation types). Uses the SDK's snake_case
# field name text_selections, consistent with token_ids/group_id below and
# with the ner_with_checklist_subclass example later in this guide.
entities_annotations = lb_types.ObjectAnnotation(
    name="named_entity",  # must match your ontology feature's name
    value=lb_types.DocumentEntity(
        name="named_entity",
        text_selections=[
            lb_types.DocumentTextSelection(
                token_ids=[],  # populated later from the text layer
                group_id="",   # populated later from the text layer
                page=1         # pages are 1-indexed
            )
        ]
    )
)
# NDJSON payload for the entity annotation above.
entities_annotations_ndjson = {
    "name": "named_entity",
    "textSelections": [
        {
            # ids associated with each word in a group
            "tokenIds": ["<UUID>"],
            # id associated with a group of words
            "groupId": "<UUID>",
            "page": 1,
        }
    ],
}

Classification: Radio (Single-choice, Global)

# Radio (single-choice) classification, global scope.
radio_annotation = lb_types.ClassificationAnnotation(
    name="radio_question",  # must match your ontology feature's name
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(name="first_radio_answer")
    ),
)
# NDJSON payload for the radio classification above.
radio_annotation_ndjson = {
    "name": "radio_question",
    "answer": {"name": "first_radio_answer"},
}

Classification: Checklist (Multi-choice, Global)

# Checklist (multi-choice) classification, global scope.
checklist_annotation = lb_types.ClassificationAnnotation(
    name="checklist_question",  # must match your ontology feature's name
    value=lb_types.Checklist(
        answer=[
            lb_types.ClassificationAnswer(name="first_checklist_answer"),
            lb_types.ClassificationAnswer(name="second_checklist_answer"),
        ]
    ),
)
# NDJSON payload for the checklist classification above.
checklist_annotation_ndjson = {
    "name": "checklist_question",
    "answer": [
        {"name": "first_checklist_answer"},
        {"name": "second_checklist_answer"},
    ],
}

Bounding box (Page specific)

# Bounding box annotation (Python annotation types), in point units.
bbox_annotation = lb_types.ObjectAnnotation(
    name="bounding_box",  # must match your ontology feature's name
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=102.771, y=135.3),  # x = left, y = top 
        end=lb_types.Point(x=518.571, y=245.143),  # x = left + width , y = top + height
        # NOTE(review): page=0 here, but the entity section above says pages are
        # 1-indexed and every other snippet uses page=1 — confirm the indexing.
        page=0,
        unit=lb_types.RectangleUnit.POINTS
        )
    )
# NDJSON payload for the bounding box above (same geometry, expressed as
# top/left/height/width instead of start/end points).
bbox_annotation_ndjson = {
  "name": "bounding_box",
  "bbox": {
            "top": 135.3,
            "left": 102.771,
            "height": 109.843,
            "width": 415.8
      },
  # NOTE(review): page=0 while other snippets use 1-indexed pages — confirm.
  "page": 0,
  "unit": "POINTS"
}

Classification: Checklist with nested classifications (Global)

# Checklist answer with a nested (sub-)checklist classification, global scope.
nested_checklist_annotation = lb_types.ClassificationAnnotation(
    name="nested_checklist_question",
    value=lb_types.Checklist(
        answer=[
            lb_types.ClassificationAnswer(
                name="first_checklist_answer",
                classifications=[
                    lb_types.ClassificationAnnotation(
                        name="sub_checklist_question",
                        value=lb_types.Checklist(
                            answer=[
                                lb_types.ClassificationAnswer(
                                    name="first_sub_checklist_answer"
                                )
                            ]
                        ),
                    )
                ],
            )
        ]
    ),
)
# NDJSON payload for the nested checklist classification above.
nested_checklist_annotation_ndjson = {
    "name": "nested_checklist_question",
    "answer": [
        {
            "name": "first_checklist_answer",
            "classifications": [
                {
                    "name": "sub_checklist_question",
                    "answer": {"name": "first_sub_checklist_answer"},
                }
            ],
        }
    ],
}

Classification: Radio with nested classifications (Global)

# Radio answer with a nested (sub-)radio classification, global scope.
nested_radio_annotation = lb_types.ClassificationAnnotation(
    name="nested_radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(
            name="first_radio_answer",
            classifications=[
                lb_types.ClassificationAnnotation(
                    name="sub_radio_question",
                    value=lb_types.Radio(
                        answer=lb_types.ClassificationAnswer(
                            name="first_sub_radio_answer"
                        )
                    ),
                )
            ],
        )
    ),
)
# NDJSON payload for the nested radio classification above.
nested_radio_annotation_ndjson = {
    "name": "nested_radio_question",
    "answer": {
        "name": "first_radio_answer",
        "classifications": [
            {
                "name": "sub_radio_question",
                "answer": {"name": "first_sub_radio_answer"},
            }
        ],
    },
}

Classification: Free-form text (Global)

# Free-form text classification (global scope).
text_annotation = lb_types.ClassificationAnnotation(
  name="free_text",  # must match your ontology feature's name
  value=lb_types.Text(answer="sample text")
)

# NDJSON payload for the free-form text classification above.
text_annotation_ndjson = {
    "name": "free_text",  # must match your ontology feature's name
    "answer": "sample text",
}

Bounding box with nested classifications (Page specific)

# Bounding box with a nested radio sub-classification (page 1, point units).
# Fix: the original mixed tabs and spaces in the classifications block, which
# is fragile in Python 3 (can raise TabError); indentation is normalized here.
bbox_with_radio_subclass_annotation = lb_types.ObjectAnnotation(
    name="bbox_with_radio_subclass",  # must match your ontology feature's name
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=317.271, y=226.757),  # x = left, y = top
        end=lb_types.Point(x=566.657, y=420.986),  # x = left + width, y = top + height
        unit=lb_types.RectangleUnit.POINTS,
        page=1
    ),
    classifications=[
        lb_types.ClassificationAnnotation(
            name="sub_radio_question",
            value=lb_types.Radio(
                answer=lb_types.ClassificationAnswer(
                    name="first_sub_radio_answer",
                    classifications=[
                        lb_types.ClassificationAnnotation(
                            name="second_sub_radio_question",
                            value=lb_types.Radio(
                                answer=lb_types.ClassificationAnswer(
                                    name="second_sub_radio_answer"
                                )
                            )
                        )
                    ]
                )
            )
        )
    ]
)
# NDJSON payload for the bounding box + nested radio annotation above.
bbox_with_radio_subclass_annotation_ndjson = {
    "name": "bbox_with_radio_subclass",
    "classifications": [
        {
            "name": "sub_radio_question",
            "answer": {
                "name": "first_sub_radio_answer",
                "classifications": [
                    {
                        "name": "second_sub_radio_question",
                        "answer": {"name": "second_sub_radio_answer"},
                    }
                ],
            },
        }
    ],
    "bbox": {
        "top": 226.757,
        "left": 317.271,
        "height": 194.229,
        "width": 249.386,
    },
    "page": 1,
    "unit": "POINTS",
}

Entity with nested classifications (Page specific)

# Entity annotation with a nested checklist sub-classification (page 1).
ner_with_checklist_subclass_annotation = lb_types.ObjectAnnotation(
    name="ner_with_checklist_subclass",
    value=lb_types.DocumentEntity(
        name="ner_with_checklist_subclass",
        # Placeholder selection; filled in later from the text layer.
        text_selections=[
            lb_types.DocumentTextSelection(token_ids=[], group_id="", page=1)
        ],
    ),
    classifications=[
        lb_types.ClassificationAnnotation(
            name="sub_checklist_question",
            value=lb_types.Checklist(
                answer=[
                    lb_types.ClassificationAnswer(
                        name="first_sub_checklist_answer"
                    )
                ]
            ),
        )
    ],
)
# NDJSON payload for the entity + checklist sub-classification above.
ner_with_checklist_subclass_annotation_ndjson = {
    "name": "ner_with_checklist_subclass",
    "classifications": [
        {
            "name": "sub_checklist_question",
            "answer": [{"name": "first_sub_checklist_answer"}],
        }
    ],
    # Placeholder selection; replaced later with real token/group ids.
    "textSelections": [
        {"tokenIds": [""], "groupId": "", "page": 1}
    ],
}

Relationships with Entity (Page specific)

Relationship annotations are only supported for MAL import jobs.

# Source entity for the relationship. Uses the SDK's snake_case field name
# text_selections, consistent with token_ids/group_id and the other snippets.
entity_source = lb_types.ObjectAnnotation(
    name="named_entity",
    value=lb_types.DocumentEntity(
        name="named_entity",
        text_selections=[
            lb_types.DocumentTextSelection(
                token_ids=[],  # populated later from the text layer
                group_id="",
                page=1
            )
        ]
    )
)

# Target entity for the relationship. Uses the SDK's snake_case field name
# text_selections, consistent with token_ids/group_id and the other snippets.
entity_target = lb_types.ObjectAnnotation(
    name="named_entity",
    value=lb_types.DocumentEntity(
        name="named_entity",
        text_selections=[
            lb_types.DocumentTextSelection(
                token_ids=[],  # populated later from the text layer
                group_id="",
                page=1
            )
        ]
    )
)

# Directed relationship between the two entities above (MAL imports only).
entity_relationship = lb_types.RelationshipAnnotation(
    name="relationship",
    value=lb_types.Relationship(
        source=entity_source,
        target=entity_target,
        type=lb_types.Relationship.Type.UNIDIRECTIONAL,
    ),
)

# UUIDs linking the NDJSON relationship payload to its source/target entities.
uuid_source = str(uuid.uuid4())
uuid_target = str(uuid.uuid4())

# Placeholder selections; replaced later with real token/group ids.
entity_source_ndjson = {
    "name": "named_entity",
    "uuid": uuid_source,
    "textSelections": [
        {"tokenIds": [""], "groupId": "", "page": 1}
    ],
}

entity_target_ndjson = {
    "name": "named_entity",
    "uuid": uuid_target,
    "textSelections": [
        {"tokenIds": [""], "groupId": "", "page": 1}
    ],
}

ner_relationship_annotation_ndjson = {
    "name": "relationship",
    "relationship": {
        "source": uuid_source,  # UUID reference to source annotation
        "target": uuid_target,  # UUID reference to target annotation
        "type": "unidirectional",
    },
}

Relationships with bounding box (Page specific)

# Source and target bounding boxes for the relationship (page 1, point units).
bbox_source = lb_types.ObjectAnnotation(
    name="bounding_box",
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=188.257, y=68.875),  # x = left, y = top
        end=lb_types.Point(x=270.907, y=149.556),  # x = left + width, y = top + height
        unit=lb_types.RectangleUnit.POINTS,
        page=1,
    ),
)

bbox_target = lb_types.ObjectAnnotation(
    name="bounding_box",
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=96.424, y=66.251),
        end=lb_types.Point(x=179.074, y=146.932),
        unit=lb_types.RectangleUnit.POINTS,
        page=1,
    ),
)

# Directed relationship between the two boxes (MAL imports only).
bbox_relationship = lb_types.RelationshipAnnotation(
    name="relationship",
    value=lb_types.Relationship(
        source=bbox_source,
        target=bbox_target,
        type=lb_types.Relationship.Type.UNIDIRECTIONAL,
    ),
)
## Only supported for MAL imports
uuid_source_2 = str(uuid.uuid4())
uuid_target_2 = str(uuid.uuid4())

bbox_source_ndjson = {
    "name": "bounding_box",
    "uuid": uuid_source_2,
    "bbox": {"top": 68.875, "left": 188.257, "height": 80.681, "width": 82.65},
    "page": 1,
    "unit": "POINTS",
}

bbox_target_ndjson = {
    "name": "bounding_box",
    "uuid": uuid_target_2,
    "bbox": {"top": 66.251, "left": 96.424, "height": 80.681, "width": 82.65},
    "page": 1,
    "unit": "POINTS",
}

bbox_relationship_annotation_ndjson = {
    "name": "relationship",
    "relationship": {
        "source": uuid_source_2,  # UUID reference to source bbox annotation
        "target": uuid_target_2,  # UUID reference to target bbox annotation
        "type": "unidirectional",
    },
}

End-to-end example: Import pre-labels or ground truth

Whether you are importing annotations as pre-labels or as ground truth, the steps are very similar. Steps 5 and 6 (creating and importing the annotation payload) are where the process becomes slightly different and is explained below in detail.

Before you start

You must import these libraries to use the code examples in this section.

import uuid
import json
import labelbox as lb
import labelbox.types as lb_types

Replace with your API key

# Replace with a valid Labelbox API key before running.
API_KEY = ""
client = lb.Client(api_key=API_KEY)

Step 1: Import data rows

Here, we create an example document data row in Catalog. If you want to use entity annotations, you may include a text layer URL; if you do not, Labelbox will generate a text layer URL using Google Document AI.

## Text layer url is required for uploading entity annotations
global_key = "0801.3483.pdf"
# Data row payload; a custom text layer URL could be attached here —
# otherwise Labelbox generates one with Google Document AI (see above).
img_url = {
    "row_data": {
      "pdf_url": "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483.pdf",
    },
    "global_key": global_key
}


dataset = client.create_dataset(name="pdf_demo_dataset")
task = dataset.create_data_rows([img_url])
task.wait_till_done()
print(f"Failed data rows: {task.failed_data_rows}")
print(f"Errors: {task.errors}")

if task.errors:
    for error in task.errors:
        # NOTE(review): assumes each error is a dict with a 'message' key —
        # confirm against the SDK's task.errors format.
        if 'Duplicate global key' in error['message'] and dataset.row_count == 0:
            # If the global key already exists in the workspace the dataset will be created empty, so we can delete it.
            print(f"Deleting empty dataset: {dataset}")
            dataset.delete()

Step 2: Create an ontology

Your project should have the correct ontology set up with all the tools and classifications supported for your annotations. The value for the name parameter should match the name field in your annotations to ensure the correct feature schemas are matched.

Here is an example of creating an ontology programmatically for all the sample annotations above.

## Set up the ontology and link the tools created above.
## Each `name` here must match the `name` field in the annotation payloads.

ontology_builder = lb.OntologyBuilder(
    # Global classifications (asked once per document).
    classifications=[
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="radio_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(value="first_radio_answer"),
                lb.Option(value="second_radio_answer"),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="checklist_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(value="first_checklist_answer"),
                lb.Option(value="second_checklist_answer"),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.TEXT,
            name="free_text",
            scope=lb.Classification.Scope.GLOBAL,
        ),
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="nested_radio_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(
                    "first_radio_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.RADIO,
                            name="sub_radio_question",
                            options=[lb.Option("first_sub_radio_answer")],
                        )
                    ],
                )
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="nested_checklist_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(
                    "first_checklist_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.CHECKLIST,
                            name="sub_checklist_question",
                            options=[lb.Option("first_sub_checklist_answer")],
                        )
                    ],
                )
            ],
        ),
    ],
    # Tools (page-specific objects drawn on the document).
    tools=[
        lb.Tool(tool=lb.Tool.Type.BBOX, name="bounding_box"),
        lb.Tool(tool=lb.Tool.Type.NER, name="named_entity"),
        lb.Tool(tool=lb.Tool.Type.RELATIONSHIP, name="relationship"),
        lb.Tool(
            tool=lb.Tool.Type.NER,
            name="ner_with_checklist_subclass",
            classifications=[
                lb.Classification(
                    class_type=lb.Classification.Type.CHECKLIST,
                    name="sub_checklist_question",
                    options=[lb.Option(value="first_sub_checklist_answer")],
                )
            ],
        ),
        lb.Tool(
            tool=lb.Tool.Type.BBOX,
            name="bbox_with_radio_subclass",
            classifications=[
                lb.Classification(
                    class_type=lb.Classification.Type.RADIO,
                    name="sub_radio_question",
                    options=[
                        lb.Option(
                            value="first_sub_radio_answer",
                            options=[
                                lb.Classification(
                                    class_type=lb.Classification.Type.RADIO,
                                    name="second_sub_radio_question",
                                    options=[lb.Option("second_sub_radio_answer")],
                                )
                            ],
                        )
                    ],
                )
            ],
        ),
    ],
)

ontology = client.create_ontology(
    "Document Annotation Import Demo",
    ontology_builder.asdict(),
    media_type=lb.MediaType.Document,
)

Step 3: Create a labeling project

Create a project and connect the ontology created above

# Create a Labelbox project and connect the ontology created above.
project = client.create_project(
    name="PDF_annotation_demo",
    # Fix: QueueMode is only in scope via the `lb` namespace
    # (`import labelbox as lb`); the bare name would raise NameError.
    queue_mode=lb.QueueMode.Batch,
    media_type=lb.MediaType.Document,
)
project.setup_editor(ontology)

Step 4: Send a batch of data rows to the project

# Each batch in a project must have a unique name.
project.create_batch(
    "PDF_annotation_batch",
    global_keys=[global_key],  # a list of global keys, data rows, or data row ids
    priority=5,  # priority between 1 (highest) and 5 (lowest)
)

Step 5: Create the annotation payload

To import NER annotations, you can either pass a text_layer_url or use the Labelbox-generated text_layer_url.

To extract Labelbox generated text_layer_url we first need to export the data row.

# Export the data row to obtain the Labelbox-generated text layer URL.
client.enable_experimental = True
task = lb.DataRow.export(client=client,global_keys=[global_key])
task.wait_till_done()
stream = task.get_stream()

text_layer = ""
for output in stream:
    output_json = json.loads(output.json_str)
    # Only one data row was exported, so this keeps that row's text layer URL.
    text_layer = output_json['media_attributes']['text_layer_url']
print(text_layer)

import requests
import json

# Helper method
def update_text_selections(annotation, group_id, list_tokens, page):
    """Attach a single page-specific text selection to an NDJSON payload.

    Mutates ``annotation`` in place, replacing any existing
    ``"textSelections"`` entry. Returns None — the original version did
    ``return annotation.update(...)``, which also returned None (dict.update
    returns None) but misleadingly suggested a useful return value.

    Args:
        annotation: NDJSON annotation payload (a dict) to modify.
        group_id: id representing a group of words in the text layer.
        list_tokens: ids of the individual words in the group.
        page: 1-indexed page number of the document.
    """
    annotation.update({
        "textSelections": [
            {
                "groupId": group_id,
                "tokenIds": list_tokens,
                "page": page
            }
        ]
    })
  
# Fetch the content of the text layer from the exported text_layer_url.
res = requests.get(text_layer) 

# Phrases that we want to annotate, obtained from the text layer url.
# Fix: the original list had a stray double comma after the second
# element, which is a SyntaxError.
content_phrases = [
    "Metal-insulator (MI) transitions have been one of the",
    "T. Sasaki, N. Yoneyama, and N. Kobayashi",
    "Organic charge transfer salts based on the donor",
    "the experimental investigations on this issue have not",
]

# Parse the text layer: collect the token/group ids for each target phrase.
# Fix: DocumentTextSelection kwargs are standardized to the SDK's snake_case
# field names (group_id / token_ids); the original mixed camelCase and
# snake_case (e.g. `group_id=` with `tokenIds=` in the last branch).
text_selections = []
text_selections_ner = []
text_selections_source = []
text_selections_target = []

for obj in json.loads(res.text):
    for group in obj["groups"]:
        if group["content"] == content_phrases[0]:
            list_tokens = [x["id"] for x in group["tokens"]]
            # build text selections for Python annotation types
            document_text_selection = lb_types.DocumentTextSelection(
                group_id=group["id"], token_ids=list_tokens, page=1)
            text_selections.append(document_text_selection)
            # build text selection for the NDJSON annotations
            update_text_selections(annotation=entities_annotations_ndjson,
                                   group_id=group["id"],  # id representing group of words
                                   list_tokens=list_tokens,  # ids of individual words in the group
                                   page=1)
        if group["content"] == content_phrases[1]:
            list_tokens_2 = [x["id"] for x in group["tokens"]]
            # build text selections for Python annotation types
            ner_text_selection = lb_types.DocumentTextSelection(
                group_id=group["id"], token_ids=list_tokens_2, page=1)
            text_selections_ner.append(ner_text_selection)
            # build text selection for the NDJSON annotations
            update_text_selections(annotation=ner_with_checklist_subclass_annotation_ndjson,
                                   group_id=group["id"],
                                   list_tokens=list_tokens_2,
                                   page=1)
        if group["content"] == content_phrases[2]:
            relationship_source = [x["id"] for x in group["tokens"]]
            # build text selections for Python annotation types
            text_selection_entity_source = lb_types.DocumentTextSelection(
                group_id=group["id"], token_ids=relationship_source, page=1)
            text_selections_source.append(text_selection_entity_source)
            # build text selection for the NDJSON annotations
            update_text_selections(annotation=entity_source_ndjson,
                                   group_id=group["id"],
                                   list_tokens=relationship_source,
                                   page=1)
        if group["content"] == content_phrases[3]:
            relationship_target = [x["id"] for x in group["tokens"]]
            # build text selections for Python annotation types
            text_selection_entity_target = lb_types.DocumentTextSelection(
                group_id=group["id"], token_ids=relationship_target, page=1)
            text_selections_target.append(text_selection_entity_target)
            # build text selections for the NDJSON annotations
            update_text_selections(annotation=entity_target_ndjson,
                                   group_id=group["id"],
                                   list_tokens=relationship_target,
                                   page=1)
  

Re-write the Python annotations to include text selections (only required for Python annotation types)

# re-write the entity annotation with text selections
# Fix: standardized on the SDK's snake_case field name text_selections —
# the original mixed `textSelections=` and `text_selections=` — and
# corrected comment typos.
entities_annotation_document_entity = lb_types.DocumentEntity(
    name="named_entity", text_selections=text_selections)
entities_annotation = lb_types.ObjectAnnotation(
    name="named_entity", value=entities_annotation_document_entity)

# re-write the entity annotation + subclassification with text selections
classifications = [
    lb_types.ClassificationAnnotation(
        name="sub_checklist_question",
        value=lb_types.Checklist(
            answer=[lb_types.ClassificationAnswer(name="first_sub_checklist_answer")]
        )
    )
]
ner_annotation_with_subclass = lb_types.DocumentEntity(
    name="ner_with_checklist_subclass", text_selections=text_selections_ner)
ner_with_checklist_subclass_annotation = lb_types.ObjectAnnotation(
    name="ner_with_checklist_subclass",
    value=ner_annotation_with_subclass,
    classifications=classifications)

# re-write the entity source and target annotations with the text selections
entity_source_doc = lb_types.DocumentEntity(
    name="named_entity", text_selections=text_selections_source)
entity_source = lb_types.ObjectAnnotation(name="named_entity", value=entity_source_doc)

entity_target_doc = lb_types.DocumentEntity(
    name="named_entity", text_selections=text_selections_target)
entity_target = lb_types.ObjectAnnotation(name="named_entity", value=entity_target_doc)

# re-write the entity relationship with the re-created entities
entity_relationship = lb_types.RelationshipAnnotation(
    name="relationship",
    value=lb_types.Relationship(
        source=entity_source,
        target=entity_target,
        type=lb_types.Relationship.Type.UNIDIRECTIONAL,
    ))

# Sanity-check the rebuilt payloads.
print(f"entities_annotations_ndjson={entities_annotations_ndjson}")
print(f"entities_annotation={entities_annotation}")
print(f"nested_entities_annotation_ndjson={ner_with_checklist_subclass_annotation_ndjson}")
print(f"nested_entities_annotation={ner_with_checklist_subclass_annotation}")
print(f"entity_source_ndjson={entity_source_ndjson}")
print(f"entity_target_ndjson={entity_target_ndjson}")
print(f"entity_source={entity_source}")
print(f"entity_target={entity_target}")
  

Create the annotations payload using the snippets of code shown above.

Labelbox supports two formats for the annotations payload: NDJSON and Python annotation types. Both approaches are described below with instructions to compose annotations into Labels attached to the data rows.

The resulting label_ndjson and labels from each approach will include every annotation (created above) supported by the respective method.

# Compose a single Label containing every annotation created above.
labels = [
    lb_types.Label(
        data=lb_types.DocumentData(global_key=global_key),
        annotations=[
            entities_annotation,
            checklist_annotation,
            nested_checklist_annotation,
            text_annotation,
            radio_annotation,
            nested_radio_annotation,
            bbox_annotation,
            bbox_with_radio_subclass_annotation,
            ner_with_checklist_subclass_annotation,
            entity_source,
            entity_target,
            entity_relationship,  # Only supported for MAL imports
            bbox_source,
            bbox_target,
            bbox_relationship,  # Only supported for MAL imports
        ],
    )
]

# Attach the data row reference to every NDJSON payload, then collect them.
ndjson_payloads = [
    entities_annotations_ndjson,
    checklist_annotation_ndjson,
    nested_checklist_annotation_ndjson,
    text_annotation_ndjson,
    radio_annotation_ndjson,
    nested_radio_annotation_ndjson,
    bbox_annotation_ndjson,
    bbox_with_radio_subclass_annotation_ndjson,
    ner_with_checklist_subclass_annotation_ndjson,
    entity_source_ndjson,
    entity_target_ndjson,
    ner_relationship_annotation_ndjson,  # Only supported for MAL imports
    bbox_source_ndjson,
    bbox_target_ndjson,
    bbox_relationship_annotation_ndjson,  # Only supported for MAL imports
]

label_ndjson = []
for payload in ndjson_payloads:
    payload["dataRow"] = {"globalKey": global_key}
    label_ndjson.append(payload)


Step 6: Import the annotation payload

For both options, you can pass either the label_ndjson or the labels payload as the value for the predictions or labels parameter.

Option A: Upload to a labeling project as pre-labels (Model-assisted labeling)

# Upload as pre-labels (model-assisted labeling).
upload_job = lb.MALPredictionImport.create_from_objects(
    client = client,
    project_id = project.uid,
    # Unique job name; a UUID suffix avoids collisions across runs.
    name="pdf_annotation_upload" + str(uuid.uuid4()),
    predictions=labels)

upload_job.wait_until_done()
# Errors will appear for annotation uploads that failed.
print("Errors:", upload_job.errors)
print("Status of uploads: ", upload_job.statuses)

Option B: Upload to a labeling project as ground truth

📘

Relationship annotations are not supported in label import jobs

# Upload as ground truth. Relationship annotations are not supported here.
upload_job = lb.LabelImport.create_from_objects(
    client = client, 
    project_id = project.uid, 
    name="label_import_job"+str(uuid.uuid4()),  
    labels=labels)

# NOTE(review): errors/statuses are read without upload_job.wait_until_done();
# consider waiting first, as in the MAL example above — confirm intent.
print("Errors:", upload_job.errors)
print("Status of uploads: ", upload_job.statuses)