Import text annotations

How to import annotations on text data and sample import formats.

Open this Colab for an interactive tutorial on importing annotations on text assets.

Supported annotations

To import annotations in Labelbox, you need to create the annotations payload. In this section, we provide this payload for every annotation type.

Labelbox supports two formats for the annotations payload:

  • Python annotation types (recommended)
  • NDJSON

Both are described below.

Entity

# Python annotation type: a text entity spanning positions 10 to 20.
# The annotation name must match the NER tool name in the ontology.
named_entity = lb_types.TextEntity(start=10, end=20)
# NOTE: the variable name (sic) is kept as-is because the label payload
# built later in this guide refers to it by this exact name.
named_entitity_annotation = lb_types.ObjectAnnotation(
    name="named_entity",
    value=named_entity,
)
# NDJSON form of a named-entity annotation; "location" carries the
# start/end positions of the entity span in the text asset.
entities_ndjson = {
    "name": "named_entity",
    "location": {"start": 67, "end": 128},
}

Classification: Radio (single-choice)

# Radio (single-choice) classification: one answer for "radio_question".
radio_annotation = lb_types.ClassificationAnnotation(
    name="radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(name="first_radio_answer"),
    ),
)
# NDJSON form of the radio classification: "answer" is a single object.
radio_annotation_ndjson = {
    "name": "radio_question",
    "answer": {"name": "first_radio_answer"},
}

Classification: Checklist (multi-choice)

# Checklist (multi-choice) classification: the answer is a list, one
# ClassificationAnswer per selected option.
checklist_annotation = lb_types.ClassificationAnnotation(
    name="checklist_question",
    value=lb_types.Checklist(
        answer=[
            lb_types.ClassificationAnswer(name="first_checklist_answer"),
            lb_types.ClassificationAnswer(name="second_checklist_answer"),
            lb_types.ClassificationAnswer(name="third_checklist_answer"),
        ],
    ),
)
# NDJSON form of the checklist classification: "answer" is a list with one
# object per selected option.
checklist_annotation_ndjson = {
    "name": "checklist_question",
    "answer": [
        {"name": "first_checklist_answer"},
        {"name": "second_checklist_answer"},
        {"name": "third_checklist_answer"},
    ],
}

Classification: Radio with nested classifications

# Radio classification whose selected answer carries a nested radio
# sub-classification ("sub_radio_question").
nested_radio_annotation = lb_types.ClassificationAnnotation(
    name="nested_radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(
            name="first_radio_answer",
            # Sub-classifications hang off the parent answer.
            classifications=[
                lb_types.ClassificationAnnotation(
                    name="sub_radio_question",
                    value=lb_types.Radio(
                        answer=lb_types.ClassificationAnswer(
                            name="first_sub_radio_answer",
                        ),
                    ),
                ),
            ],
        ),
    ),
)
# NDJSON form of the nested radio classification: the sub-question lives in
# the "classifications" list of the parent answer.
nested_radio_annotation_ndjson = {
    "name": "nested_radio_question",
    "answer": {
        "name": "first_radio_answer",
        "classifications": [
            {
                "name": "sub_radio_question",
                "answer": {"name": "first_sub_radio_answer"},
            },
        ],
    },
}

Classification: Checklist with nested classifications

# Checklist classification whose selected answer carries a nested checklist
# sub-classification ("sub_checklist_question").
nested_checklist_annotation = lb_types.ClassificationAnnotation(
    name="nested_checklist_question",
    value=lb_types.Checklist(
        answer=[
            lb_types.ClassificationAnswer(
                name="first_checklist_answer",
                # Sub-classifications hang off the parent answer.
                classifications=[
                    lb_types.ClassificationAnnotation(
                        name="sub_checklist_question",
                        value=lb_types.Checklist(
                            answer=[
                                lb_types.ClassificationAnswer(
                                    name="first_sub_checklist_answer",
                                ),
                            ],
                        ),
                    ),
                ],
            ),
        ],
    ),
)
# NDJSON form of the nested checklist classification: the sub-question lives
# in the "classifications" list of the parent answer.
nested_checklist_annotation_ndjson = {
    "name": "nested_checklist_question",
    "answer": [
        {
            "name": "first_checklist_answer",
            "classifications": [
                {
                    "name": "sub_checklist_question",
                    # NOTE(review): the nested answer is a single object here,
                    # while the parent checklist answer is a list - confirm
                    # against the NDJSON import schema.
                    "answer": {"name": "first_sub_checklist_answer"},
                },
            ],
        },
    ],
}

Classification: Free-form text

# Free-form text classification: the answer is a plain string.
text_annotation = lb_types.ClassificationAnnotation(
    name="free_text",
    value=lb_types.Text(answer="sample text"),
)
# NDJSON form of the free-form text classification.
text_annotation_ndjson = {"name": "free_text", "answer": "sample text"}

Relationship with NER

Relationship annotations are only supported for MAL import jobs.

# Source entity of the relationship.
ner_source = lb_types.ObjectAnnotation(
    name="named_entity",
    value=lb_types.TextEntity(start=133, end=140),
)

# Target entity of the relationship.
ner_target = lb_types.ObjectAnnotation(
    name="named_entity",
    value=lb_types.TextEntity(start=143, end=159),
)

# The relationship points at the two annotation objects directly.
ner_relationship = lb_types.RelationshipAnnotation(
    name="relationship",
    value=lb_types.Relationship(
        source=ner_source,  # UUID is not required for annotation types
        target=ner_target,
        type=lb_types.Relationship.Type.UNIDIRECTIONAL,
    ),
)
# In NDJSON the relationship refers to its two entities by UUID, so each
# entity annotation is given an explicit one.
uuid_source = str(uuid.uuid4())
uuid_target = str(uuid.uuid4())

# Entity the relationship starts from.
entity_source_ndjson = {
    "name": "named_entity",
    "uuid": uuid_source,
    "location": {"start": 133, "end": 140},
}

# Entity the relationship points to.
entity_target_ndjson = {
    "name": "named_entity",
    "uuid": uuid_target,
    "location": {"start": 143, "end": 159},
}

ner_relationship_annotation_ndjson = {
    "name": "relationship",
    "relationship": {
        "source": uuid_source,  # UUID reference to the entity source annotation
        "target": uuid_target,  # UUID reference to the entity target annotation
        "type": "unidirectional",
    },
}

End-to-end example: Import pre-labels or ground truth

Whether you are importing annotations as pre-labels or as ground truth, the steps are very similar. Step 6 (importing the annotation payload) is where the process becomes slightly different and is explained below in detail.

Before you start

You will need to import these libraries to use the code examples in this section.

import labelbox as lb
import labelbox.types as lb_types
import uuid
import json

Replace with your API key

# Fill in your Labelbox API key before running the examples below.
API_KEY = ""
# Client used by every SDK call in the rest of this guide.
client = lb.Client(api_key=API_KEY)

Step 1: Import data rows

To attach annotations to a data row, it must first be uploaded to Catalog. Here we create an example text data row in Catalog.

# You can now include other fields like attachments, media type and metadata
# in the data row creation step: https://docs.labelbox.com/reference/text-file
global_key = "lorem-ipsum.txt"
text_asset = {
    "row_data": "https://storage.googleapis.com/labelbox-sample-datasets/nlp/lorem-ipsum.txt",
    "global_key": global_key,
    "media_type": "TEXT",
    "attachments": [
        {
            "type": "TEXT_URL",
            "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/text_attachment.txt",
        },
    ],
}

# Create a dataset, upload the data row, and wait for the upload task to
# finish before reporting its outcome.
dataset = client.create_dataset(name="text_annotation_import_demo_dataset")
task = dataset.create_data_rows([text_asset])
task.wait_till_done()
print("Errors:", task.errors)
print("Failed data rows:", task.failed_data_rows)

Step 2: Create/select an ontology

Your project should have an ontology set up with all the tools and classifications that your annotations require. The tool and classification names must match the name fields in your annotations so that the correct feature schemas are matched.

For example, when we created the checklist annotation above, we provided the name checklist_question. Now, when we set up our ontology, we must ensure that the name of our checklist classification is also checklist_question. The same alignment must hold true for the other tools and classifications we create in our ontology.

Here is an example of creating an ontology programmatically for all the sample annotations above.

# Set up the ontology. Every tool and classification name below matches the
# "name" field of one of the sample annotations created above.
ontology_builder = lb.OntologyBuilder(
    classifications=[  # List of Classification objects
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="radio_question",
            options=[lb.Option(value="first_radio_answer")],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="nested_radio_question",
            options=[
                lb.Option(
                    value="first_radio_answer",
                    # Nested question shown under this answer.
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.RADIO,
                            name="sub_radio_question",
                            options=[lb.Option(value="first_sub_radio_answer")],
                        ),
                    ],
                ),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="nested_checklist_question",
            options=[
                lb.Option(
                    value="first_checklist_answer",
                    # Nested question shown under this answer.
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.CHECKLIST,
                            name="sub_checklist_question",
                            options=[lb.Option(value="first_sub_checklist_answer")],
                        ),
                    ],
                ),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="checklist_question",
            options=[
                lb.Option(value="first_checklist_answer"),
                lb.Option(value="second_checklist_answer"),
                lb.Option(value="third_checklist_answer"),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.TEXT,
            name="free_text",
        ),
    ],
    tools=[  # List of Tool objects
        lb.Tool(tool=lb.Tool.Type.NER, name="named_entity"),
        lb.Tool(tool=lb.Tool.Type.RELATIONSHIP, name="relationship"),
    ],
)
ontology = client.create_ontology(
    "Ontology Text Annotations",
    ontology_builder.asdict(),
)

Step 3: Create a labeling project

Connect the ontology to the labeling project.

# Project defaults to batch mode with benchmark quality settings if the
# queue_mode argument is not provided.
# Queue mode will be deprecated once dataset mode is deprecated.
project = client.create_project(
    name="text_project_demo",
    queue_mode=lb.QueueMode.Batch,
    media_type=lb.MediaType.Text,
)

# Connect the ontology created earlier to this project.
project.setup_editor(ontology)

Step 4: Send a batch of data rows to the project

# Create a batch to send the data row to your project.
batch = project.create_batch(
    "first-batch-text-demo",  # Each batch in a project must have a unique name
    global_keys=[global_key],  # a list of global keys, data rows, or data row ids
    priority=5,  # priority between 1 (highest) - 5 (lowest)
)

print("Batch: ", batch)

Step 5: Create the annotations payload

Create the annotations payload using the snippets of code shown above.

Labelbox supports two formats for the annotations payload: NDJSON and Python annotation types. Both approaches are described below with instructions to compose annotations into Labels attached to the data rows.

The resulting labels and label_ndjson from each approach will include every annotation (created above) supported by the respective method.

# Python annotation types: a single Label bundles every sample annotation
# for the data row identified by global_key.
labels = [
    lb_types.Label(
        data=lb_types.TextData(global_key=global_key),
        annotations=[
            named_entitity_annotation,
            radio_annotation,
            checklist_annotation,
            text_annotation,
            ner_source,
            ner_target,
            ner_relationship,
            nested_checklist_annotation,
            nested_radio_annotation,
        ],
    ),
]
# NDJSON: every annotation dict is tagged with the data row it belongs to
# and collected into one payload list.
label_ndjson = []
for payload in (
    entities_ndjson,
    radio_annotation_ndjson,
    checklist_annotation_ndjson,
    text_annotation_ndjson,
    nested_radio_annotation_ndjson,
    nested_checklist_annotation_ndjson,
    entity_source_ndjson,
    entity_target_ndjson,
    ner_relationship_annotation_ndjson,
):
    # The dict is modified in place, so the sample annotation objects
    # defined earlier gain the "dataRow" key as well.
    payload["dataRow"] = {"globalKey": global_key}
    label_ndjson.append(payload)

Step 6: Import the annotation payload

For both options, you can pass either the labels or label_ndjson payload as the value for the predictions or labels parameter.

Option A: Upload to a labeling project as pre-labels (Model-assisted labeling)

# Upload the payload to the project as pre-labels. A random UUID suffix
# keeps the import job name unique across runs.
upload_job_mal = lb.MALPredictionImport.create_from_objects(
    client=client,
    project_id=project.uid,
    name=f"mal_import_job{uuid.uuid4()}",
    predictions=labels,
)

# Wait for the import job to finish, then report its outcome.
upload_job_mal.wait_until_done()
print("Errors:", upload_job_mal.errors)
print("Status of uploads: ", upload_job_mal.statuses)

Option B: Upload to a labeling project as ground truth

📘

Relationship annotations are not supported in label import jobs

# Relationship annotations are not supported in label import jobs, so strip
# them from each label before uploading as ground truth. The source and
# target entity annotations are ordinary objects and are kept.
# (If you upload the NDJSON payload instead, leave out the relationship
# entry in the same way.)
ground_truth_labels = [
    lb_types.Label(
        data=label.data,
        annotations=[
            annotation
            for annotation in label.annotations
            if not isinstance(annotation, lb_types.RelationshipAnnotation)
        ],
    )
    for label in labels
]

# Upload label for this data row in project. A random UUID suffix keeps the
# import job name unique across runs.
upload_job_label_import = lb.LabelImport.create_from_objects(
    client=client,
    project_id=project.uid,
    name="label_import_job" + str(uuid.uuid4()),
    labels=ground_truth_labels,
)

# Wait for the import job to finish, then report its outcome.
upload_job_label_import.wait_until_done()
print("Errors:", upload_job_label_import.errors)
print("Status of uploads: ", upload_job_label_import.statuses)