Upload LLM response evaluation predictions

How to upload predictions on LLM response evaluation data to a model run, with sample upload formats.

Supported predictions

To upload predictions in Labelbox, you need to create a prediction payload. This section shows how to create a payload for each supported prediction type.

Samples are shown for the following supported payload formats:

  • Python annotation types (recommended)
  • NDJSON

Both are described below.
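
The Python annotation type samples below assume that the Labelbox annotation types module is imported as lb_types, as shown here:

import labelbox.types as lb_types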

Entity (Message-based)

ner_prediction = lb_types.ObjectAnnotation(
    name="ner",
    confidence=0.5,
    value=lb_types.ConversationEntity(
        start=0,
        end=8,
        message_id="message-1"
    )
)
ner_prediction_ndjson = {
    "name": "ner",
    "confidence": 0.5,
    "location": {
        "start": 0,
        "end": 8
    },
    "messageId": "message-1"
}

Classification: Radio (Single-choice)

radio_prediction = lb_types.ClassificationAnnotation(
    name="Choose the best response",
    value=lb_types.Radio(answer=lb_types.ClassificationAnswer(
        name="Response B", confidence=0.5)))
radio_prediction_ndjson = {
    "name": "Choose the best response",
    "answer": {
      "name": "Response B",
      "confidence": 0.5
    }
}

Classification: Checklist (Multi-choice, Message-based)

checklist_prediction = lb_types.ClassificationAnnotation(
  name="checklist_convo", # must match your ontology feature's name
  value=lb_types.Checklist(
      answer = [
        lb_types.ClassificationAnswer(
            name = "first_checklist_answer",
            confidence=0.5
        ),
        lb_types.ClassificationAnswer(
            name = "second_checklist_answer",
            confidence=0.5
        )
      ]
    ),
  message_id="message-1" # Message specific annotation
 )
checklist_prediction_ndjson = {
    "name": "checklist_convo",
    "answers": [
        {"name": "first_checklist_answer","confidence":0.5},
        {"name": "second_checklist_answer","confidence":0.5}
    ],
    "messageId": "message-1"
}

Classification: Free-form text

text_prediction = lb_types.ClassificationAnnotation(
    name="Provide a reason for your choice",
    value=lb_types.Text(answer="the answer to the text questions right here", confidence=0.5)
)
text_prediction_ndjson = {
    "name": "Provide a reason for your choice",
    "answer": "This is the more concise answer",
    "confidence": 0.5
}

Classification: Nested checklist (Message-based)

nested_checklist_prediction = lb_types.ClassificationAnnotation(
  name="nested_checklist_question",
  message_id="message-1",
  value=lb_types.Checklist(
    answer=[lb_types.ClassificationAnswer(
      name="first_checklist_answer",
      confidence=0.5, # Confidence scores should be added to the answer
      classifications=[
        lb_types.ClassificationAnnotation(
          name="sub_checklist_question",
          value=lb_types.Checklist(
            answer=[lb_types.ClassificationAnswer(
            name="first_sub_checklist_answer",
            confidence=0.5 # Confidence scores should be added to the answer
          )]
        ))
      ]
    )]
  )
)
nested_checklist_prediction_ndjson = {
  "name": "nested_checklist_question",
  "messageId": "message-1",
  "answer": [{
      "name": "first_checklist_answer",
      "confidence": 0.5, # Confidence scores should be added to the answer
      "classifications" : [
        {
          "name": "sub_checklist_question",
          "answer": {
            "name": "first_sub_checklist_answer",
            "confidence": 0.5, # Confidence scores should be added to the answer
          }
        }
      ]
  }]
}

Classification: Nested radio

nested_radio_prediction = lb_types.ClassificationAnnotation(
  name="nested_radio_question",
  value=lb_types.Radio(
    answer=lb_types.ClassificationAnswer(
      name="first_radio_answer",
      confidence=0.5, # Confidence scores should be added to the answer
      classifications=[
        lb_types.ClassificationAnnotation(
          name="sub_radio_question",
          value=lb_types.Radio(
            answer=lb_types.ClassificationAnswer(
              name="first_sub_radio_answer",
              confidence=0.5 # Confidence scores should be added to the answer
            )
          )
        )
      ]
    )
  )
)
nested_radio_prediction_ndjson = {
  "name": "nested_radio_question",
  "answer": {
      "name": "first_radio_answer",
      "confidence": 0.5,
      "classifications": [{
          "name":"sub_radio_question",
          "answer": { "name" : "first_sub_radio_answer",
                     "confidence": 0.5}
        }]
    }
}

Example: Upload predictions to a model run

Here are the steps to upload predictions to a model run:

Before you start

You must import these libraries to use the code examples in this section.

import labelbox as lb
import labelbox.types as lb_types
import uuid

Replace the value of API_KEY with a valid API key to connect to the Labelbox client.

API_KEY = None
client = lb.Client(API_KEY)

Step 1: Import data rows

Upload your data rows to Catalog. To learn how to format your import JSON files, see Import conversation & model response data.

Note the additional modelOutputs object required. For descriptions, see Model outputs object definitions.

global_key = "pairwise_shopping_asset"
convo_data = {
    "row_data": "https://storage.googleapis.com/labelbox-datasets/conversational-sample-data/pairwise_shopping_2.json",
    "global_key": global_key
}
# Create a dataset
dataset = client.create_dataset(name="pairwise_prediction_demo")
# Create data rows
task = dataset.create_data_rows([convo_data])
task.wait_till_done()
print("Errors:", task.errors)
print("Failed data rows:", task.failed_data_rows)
# Contents of the JSON file referenced above, including the required modelOutputs object
pairwise_shopping_2 = """
 {
  "type": "application/vnd.labelbox.conversational",
  "version": 1,
  "messages": [
    {
      "messageId": "message-0",
      "timestampUsec": 1530718491,
      "content": "Hi! How can I help?",
      "user": {
        "userId": "Bot 002",
        "name": "Bot"
      },
      "align": "left",
      "canLabel": false
    },
    {
      "messageId": "message-1",
      "timestampUsec": 1530718503,
      "content": "I just bought a vacuum cleaner from you guys like a week ago and it's already broken!!",
      "user": {
        "userId": "User 00686",
        "name": "User"
      },
      "align": "right",
      "canLabel": true
    }

  ],
  "modelOutputs": [
    {
      "title": "Response A",
      "content": "I'm really sorry to hear that your vacuum cleaner is not working as expected. We certainly understand how frustrating this can be, especially with a recent purchase. I assure you, we're here to help!\n\n To proceed with resolving this issue, could you please provide some details about the problem you're experiencing with the vacuum cleaner? Additionally, if you could share your order number or receipt details, that would be very helpful. We will definitely look into the available options for you, which may include a replacement or a refund, depending on the specific circumstances and our return policy.\n\n Your satisfaction is our priority, and we'll do our best to resolve this matter promptly and to your satisfaction.",
      "modelConfigName": "GPT-3.5 with temperature 0"
    },
    {
      "title": "Response B",
      "content": "I'm sorry about the issue with your vacuum cleaner. Please send us the order number or receipt details so we can quickly assist you with a replacement. Your satisfaction is important to us!",
      "modelConfigName": "Fine Tuned GPT-3.5 with demo data"
    }
  ]
}
"""

Step 2: Create/select an ontology for your model predictions

Your model run should use an ontology that includes all the tools and classifications required for your predictions.

Here is an example of creating an ontology programmatically for all the example predictions above:

ontology_builder = lb.OntologyBuilder(
  tools=[
    lb.Tool(tool=lb.Tool.Type.NER,name="ner"),
  ],
  classifications=[
    lb.Classification(
      class_type=lb.Classification.Type.RADIO,
      scope=lb.Classification.Scope.GLOBAL,
      name="Choose the best response",
      options=[lb.Option(value="Response A"), lb.Option(value="Response B"), lb.Option(value="Tie")]
    ),
    lb.Classification(
      class_type=lb.Classification.Type.TEXT,
      name="Provide a reason for your choice"
    ),
    lb.Classification(
      class_type=lb.Classification.Type.CHECKLIST,
      scope=lb.Classification.Scope.INDEX,
      name="checklist_convo",
      options=[
        lb.Option(value="first_checklist_answer"),
        lb.Option(value="second_checklist_answer")
      ]
    ),
    lb.Classification(
      class_type=lb.Classification.Type.CHECKLIST,
      name="nested_checklist_question",
      scope = lb.Classification.Scope.INDEX,
      options=[
          lb.Option("first_checklist_answer",
            options=[
              lb.Classification(
                  class_type=lb.Classification.Type.CHECKLIST,
                  name="sub_checklist_question",
                  options=[lb.Option("first_sub_checklist_answer")]
              )
          ])
      ]
    ),
    lb.Classification(
        class_type=lb.Classification.Type.RADIO,
        name="nested_radio_question",
        scope = lb.Classification.Scope.GLOBAL,
        options=[
            lb.Option("first_radio_answer",
                options=[
                    lb.Classification(
                        class_type=lb.Classification.Type.RADIO,
                        name="sub_radio_question",
                        options=[lb.Option("first_sub_radio_answer")]
                    )
                ])
          ]
    )
  ]
)

ontology = client.create_ontology("Pairwise comparison ontology", ontology_builder.asdict(), media_type=lb.MediaType.Conversational)

Step 3: Create a model and a model run

Create a model using the ontology, and then create a model run for it.

# create model
model = client.create_model(name="Comparison_model_run_"+ str(uuid.uuid4()),
                            ontology_id=ontology.uid)
# create model run
model_run = model.create_model_run("iteration 1")

Step 4: Send data rows to the model run

model_run.upsert_data_rows(global_keys=[global_key])

Step 5: Create the predictions payload

Create the predictions payload using the example snippets shown above.

Labelbox supports two formats for the predictions payload: Python annotation types and NDJSON. Both approaches are shown below; each composes the predictions into a payload attached to the data row by its global key.

The resulting label_prediction and label_ndjson payloads contain exactly the same prediction content, so you only need to upload one of them.

label_prediction = []
label_prediction.append(lb_types.Label(
  data= {"global_key": global_key},
  annotations= [
    ner_prediction,
    text_prediction,
    checklist_prediction,
    radio_prediction,
    nested_radio_prediction,
    nested_checklist_prediction
  ]
))
label_ndjson = []
for annotations in [
    ner_prediction_ndjson,
    text_prediction_ndjson,
    checklist_prediction_ndjson,
    radio_prediction_ndjson,
    nested_checklist_prediction_ndjson,
    nested_radio_prediction_ndjson
    ]:
  annotations.update({
      "dataRow": {
          "globalKey": global_key
      }
  })
  label_ndjson.append(annotations)

Step 6: Upload predictions payload to the model run

# Upload the prediction payload to the model run
upload_job_prediction = model_run.add_predictions(
    name="prediction_upload_job" + str(uuid.uuid4()),
    predictions=label_prediction)

upload_job_prediction.wait_until_done()
# Errors will appear for prediction uploads that failed.
print("Errors:", upload_job_prediction.errors)
print("Status of uploads: ", upload_job_prediction.statuses)

Step 7: Send annotations to the model run (optional)

To visualize both ground truth annotations and predictions in the model run, create a project with ground truth annotations.

To send annotations to a model run, first import them into a project as a label payload, and then upsert the project's labels to the model run.


# 7.1 Create a labelbox project
project = client.create_project(name="Conversational Human Evaluation Demo",
                                    media_type=lb.MediaType.Conversational)
project.connect_ontology(ontology)

# 7.2 Create a batch to send to the project
project.create_batch(
  "batch_convo_prediction_demo", # Each batch in a project must have a unique name
  global_keys=[global_key], # A list of data row IDs or global keys to include in the batch
  priority=5 # Priority between 1 (highest) and 5 (lowest)
)

# 7.3 Create the annotations payload
# Define each annotation the same way as its prediction counterpart above,
# but without the confidence scores. For the supported annotation formats, see:
# https://labelbox-group.readme.io/reference/import-conversational-text-annotations#supported-annotations
ner_annotation = ...
text_annotation = ...
checklist_annotation = ...
radio_annotation = ...
nested_radio_annotation = ...
nested_checklist_annotation = ...

# 7.4 Create the label object 
label_annotation = []
label_annotation.append(lb_types.Label(
  data= {"global_key": global_key},
  annotations= [
    ner_annotation,
    text_annotation,
    checklist_annotation,
    radio_annotation,
    nested_radio_annotation,
    nested_checklist_annotation
  ]
))

# 7.5 Upload annotations to the project using label import
upload_job_annotation = lb.LabelImport.create_from_objects(
    client = client,
    project_id = project.uid,
    name="label_import_job"+ str(uuid.uuid4()),
    labels=label_annotation)

upload_job_annotation.wait_until_done()
# Errors will appear for annotation uploads that failed.
print("Errors:", upload_job_annotation.errors)
print("Status of uploads: ", upload_job_annotation.statuses)

# 7.6 Send the annotations to the model run 
model_run.upsert_labels(project_id=project.uid)