Import text annotations

How to import annotations on text data and sample import formats.

You can use the Python SDK to import annotations on text assets.

This page shows how to declare the annotations and demonstrates the import process.

A Python notebook demonstrates these steps and can be run directly with Google Colab.

Supported annotations

To import annotations in Labelbox, you need to create the annotations payload. This section shows how to declare payloads for each supported annotation type. You can declare payloads as Python annotation types (preferred) or as NDJSON objects.

Entity

# Entity annotation (Python type): start/end are character offsets into the text
named_entity = lb_types.TextEntity(start=10, end=20)
# NOTE(review): variable name keeps its original (misspelled) form because the
# label-composition step later in this page refers to it by this exact name.
named_entitity_annotation = lb_types.ObjectAnnotation(
    name="named_entity",
    value=named_entity,
)

# Equivalent NDJSON payload: "location" carries the character offsets
entities_ndjson = {
    "name": "named_entity",
    "location": {"start": 67, "end": 128},
}

Classification: radio (single choice)

# Radio (single-choice) classification as a Python annotation type
radio_annotation = lb_types.ClassificationAnnotation(
    name="radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(name="first_radio_answer")
    ),
)

# Equivalent NDJSON payload
radio_annotation_ndjson = {
    "name": "radio_question",
    "answer": {"name": "first_radio_answer"},
}

Classification: checklist (multiple choice)

# Checklist (multiple-choice) classification as a Python annotation type
checklist_answers = [
    lb_types.ClassificationAnswer(name="first_checklist_answer"),
    lb_types.ClassificationAnswer(name="second_checklist_answer"),
    lb_types.ClassificationAnswer(name="third_checklist_answer"),
]
checklist_annotation = lb_types.ClassificationAnnotation(
    name="checklist_question",
    value=lb_types.Checklist(answer=checklist_answers),
)

# Equivalent NDJSON payload
checklist_annotation_ndjson = {
    "name": "checklist_question",
    "answer": [
        {"name": "first_checklist_answer"},
        {"name": "second_checklist_answer"},
        {"name": "third_checklist_answer"},
    ],
}

Classification: radio with nested classifications

# Radio classification with a nested (sub) radio question
nested_radio_annotation = lb_types.ClassificationAnnotation(
    name="nested_radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(
            name="first_radio_answer",
            # Sub-classification attached to the selected answer
            classifications=[
                lb_types.ClassificationAnnotation(
                    name="sub_radio_question",
                    value=lb_types.Radio(
                        answer=lb_types.ClassificationAnswer(
                            name="first_sub_radio_answer"
                        )
                    ),
                )
            ],
        )
    ),
)

# Equivalent NDJSON payload (nested answers live under "classifications")
nested_radio_annotation_ndjson = {
    "name": "nested_radio_question",
    "answer": {
        "name": "first_radio_answer",
        "classifications": [
            {
                "name": "sub_radio_question",
                "answer": {"name": "first_sub_radio_answer"},
            }
        ],
    },
}

Classification: checklist with nested classifications

# Checklist classification with a nested (sub) checklist question
nested_checklist_annotation = lb_types.ClassificationAnnotation(
    name="nested_checklist_question",
    value=lb_types.Checklist(
        answer=[
            lb_types.ClassificationAnswer(
                name="first_checklist_answer",
                # Sub-classification attached to the selected answer
                classifications=[
                    lb_types.ClassificationAnnotation(
                        name="sub_checklist_question",
                        value=lb_types.Checklist(
                            answer=[
                                lb_types.ClassificationAnswer(
                                    name="first_sub_checklist_answer"
                                )
                            ]
                        ),
                    )
                ],
            )
        ]
    ),
)

# Equivalent NDJSON payload (nested answers live under "classifications")
nested_checklist_annotation_ndjson = {
    "name": "nested_checklist_question",
    "answer": [
        {
            "name": "first_checklist_answer",
            "classifications": [
                {
                    "name": "sub_checklist_question",
                    "answer": {"name": "first_sub_checklist_answer"},
                }
            ],
        }
    ],
}

Classification: free-form text

# Free-form text classification
text_annotation = lb_types.ClassificationAnnotation(
    name="free_text",
    value=lb_types.Text(answer="sample text"),
)

# Equivalent NDJSON payload
text_annotation_ndjson = {
    "name": "free_text",
    "answer": "sample text",
}

Relationship with NER

Relationship annotations are only supported for model assisted labeling (MAL) import jobs.

# Source and target entities for the relationship
ner_source = lb_types.ObjectAnnotation(
    name="named_entity",
    value=lb_types.TextEntity(start=133, end=140),
)

ner_target = lb_types.ObjectAnnotation(
    name="named_entity",
    value=lb_types.TextEntity(start=143, end=159),
)

# Relationship between the two entities. UUIDs are not required for Python
# annotation types — the source/target objects are linked directly.
ner_relationship = lb_types.RelationshipAnnotation(
    name="relationship",
    value=lb_types.Relationship(
        source=ner_source,
        target=ner_target,
        type=lb_types.Relationship.Type.UNIDIRECTIONAL,
    ),
)

# NDJSON payloads reference each other by UUID instead
uuid_source = str(uuid.uuid4())
uuid_target = str(uuid.uuid4())

entity_source_ndjson = {
    "name": "named_entity",
    "uuid": uuid_source,
    "location": {"start": 133, "end": 140},
}

entity_target_ndjson = {
    "name": "named_entity",
    "uuid": uuid_target,
    "location": {"start": 143, "end": 159},
}

ner_relationship_annotation_ndjson = {
    "name": "relationship",
    "relationship": {
        "source": uuid_source,  # UUID of the entity source annotation
        "target": uuid_target,  # UUID of the entity target annotation
        "type": "unidirectional",
    },
}

Example: Import prelabels or ground truth

The process to import annotations as prelabels (model-assisted labeling) is very similar to the ground truth import process. They vary slightly in Steps 5 and 6, which describe the differences in detail.

Before you start

These examples require the following libraries:

import labelbox as lb
import labelbox.types as lb_types
import uuid
import json

Replace API key

# Paste your Labelbox API key here, then create the SDK client with it.
API_KEY = ""
client = lb.Client(API_KEY)

Step 1: Import data rows

To attach annotations to a data row, it must first be uploaded to Catalog.

This example shows how to create a text data row in Catalog.

# You can also include other fields such as attachments, media type, and
# metadata in the data row creation step:
# https://docs.labelbox.com/reference/text-file
global_key = "lorem-ipsum.txt"
text_asset = {
    "row_data": "https://storage.googleapis.com/labelbox-sample-datasets/nlp/lorem-ipsum.txt",
    "global_key": global_key,
    "media_type": "TEXT",
    "attachments": [
        {
            "type": "TEXT_URL",
            "value": "https://storage.googleapis.com/labelbox-sample-datasets/Docs/text_attachment.txt",
        }
    ],
}

# Create the dataset and upload the data row, then report any failures
dataset = client.create_dataset(name="text_annotation_import_demo_dataset")
task = dataset.create_data_rows([text_asset])
task.wait_till_done()
print("Errors:", task.errors)
print("Failed data rows:", task.failed_data_rows)

Step 2: Set up ontology

Your project ontology should support all tools and classifications required by your annotations. To ensure correct schema feature matches, the tool and classification name values should match the name field values in your annotations.

To illustrate, suppose you created a checklist annotation and set its name parameter to checklist_question. When you create the checklist classification in your ontology, you need to set its name to checklist_question and follow a similar process for each classification in your annotation.

This example demonstrates this.

## Set up the ontology and link the tools created above.
# Every tool/classification `name` (and option `value`) must match the `name`
# fields used in the annotation payloads so features map to the right schema.
# Fix: `lb.Option` was called positionally in two places and with `value=`
# everywhere else — all calls now use the `value=` keyword consistently.

ontology_builder = lb.OntologyBuilder(
    classifications=[  # List of Classification objects
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="radio_question",
            options=[lb.Option(value="first_radio_answer")],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="nested_radio_question",
            options=[
                lb.Option(
                    value="first_radio_answer",
                    # Sub-classification shown only when this answer is chosen
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.RADIO,
                            name="sub_radio_question",
                            options=[lb.Option(value="first_sub_radio_answer")],
                        ),
                    ],
                ),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="nested_checklist_question",
            options=[
                lb.Option(
                    value="first_checklist_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.CHECKLIST,
                            name="sub_checklist_question",
                            options=[lb.Option(value="first_sub_checklist_answer")],
                        )
                    ],
                )
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="checklist_question",
            options=[
                lb.Option(value="first_checklist_answer"),
                lb.Option(value="second_checklist_answer"),
                lb.Option(value="third_checklist_answer"),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.TEXT,
            name="free_text",
        ),
    ],
    tools=[  # List of Tool objects
        lb.Tool(
            tool=lb.Tool.Type.NER,
            name="named_entity",
        ),
        lb.Tool(
            tool=lb.Tool.Type.RELATIONSHIP,
            name="relationship",
        ),
    ],
)
ontology = client.create_ontology("Ontology Text Annotations", ontology_builder.asdict())

Step 3: Create labeling project

Connect the ontology to the labeling project.

# If this argument is not provided, the project defaults to batch mode with
# benchmark quality settings.
# queue_mode will be deprecated once dataset mode is deprecated.
project = client.create_project(
    name="text_project_demo",
    queue_mode=lb.QueueMode.Batch,
    media_type=lb.MediaType.Text,
)

# Attach the ontology created above to the project's editor
project.setup_editor(ontology)

Step 4: Send data rows to project

# Create a batch of data rows to send to your MAL project
batch = project.create_batch(
    "first-batch-text-demo",   # each batch in a project must have a unique name
    global_keys=[global_key],  # a list of global keys, data rows, or data row ids
    priority=5,                # priority between 1 (highest) and 5 (lowest)
)

print("Batch: ", batch)

Step 5: Create the annotations payload

See supported annotations for help creating annotation payloads. You can declare annotations using Python annotation types (preferred) or as NDJSON objects.

This example shows how to create annotation payloads for each supported annotation type and describes how to compose annotations into labels attached to data rows.

# Compose the Python-type annotations into a single Label attached to the
# data row via its global key
labels = []
labels.append(
    lb_types.Label(
        data=lb_types.TextData(global_key=global_key),
        annotations=[
            named_entitity_annotation,
            radio_annotation,
            checklist_annotation,
            text_annotation,
            ner_source,
            ner_target,
            ner_relationship,
            nested_checklist_annotation,
            nested_radio_annotation,
        ],
    )
)

# For NDJSON, attach each payload to the data row by adding a "dataRow" key
label_ndjson = []
ndjson_payloads = [
    entities_ndjson,
    radio_annotation_ndjson,
    checklist_annotation_ndjson,
    text_annotation_ndjson,
    nested_radio_annotation_ndjson,
    nested_checklist_annotation_ndjson,
    entity_source_ndjson,
    entity_target_ndjson,
    ner_relationship_annotation_ndjson,
]
for payload in ndjson_payloads:
    payload.update({"dataRow": {"globalKey": global_key}})
    label_ndjson.append(payload)

Step 6: Import annotation payload

For each option, pass the appropriate payload to the relevant parameter. Prelabels should pass the payload to predictions while ground truths should pass the payload to labels.

# Option A: upload as prelabels (model-assisted labeling)
upload_job_mal = lb.MALPredictionImport.create_from_objects(
    client=client,
    project_id=project.uid,
    name="mal_import_job" + str(uuid.uuid4()),
    predictions=labels,
)

# Block until the import finishes, then report the outcome
upload_job_mal.wait_until_done()
print("Errors:", upload_job_mal.errors)
print("Status of uploads: ", upload_job_mal.statuses)

Option B: Upload as ground truth

Relationship annotations are not supported for ground truth import jobs.

# Upload the label for this data row in the project as ground truth
upload_job_label_import = lb.LabelImport.create_from_objects(
    client=client,
    project_id=project.uid,
    name="label_import_job" + str(uuid.uuid4()),
    labels=labels,
)

# Block until the import finishes, then report the outcome
upload_job_label_import.wait_until_done()
print("Errors:", upload_job_label_import.errors)
print("Status of uploads: ", upload_job_label_import.statuses)