Import multimodal chat data

How to import multimodal chat data and sample import formats.

Specifications

File format: chat data JSON in conversation v2 format

Import methods:

  • Local upload (maximum character count: 2,621,440)
  • IAM Delegated Access
  • Signed URLs (https URLs only)

When importing conversation or thread data to Labelbox, include the following information for each data row in your JSON file.

| Parameter | Required | Description |
| --- | --- | --- |
| row_data | Yes | HTTPS path to a cloud-hosted conversational text JSON file. See the section below for details on our conversation format. |
| global_key | No | Unique user-generated file name or ID for the file. Global keys must be unique within your organization. Data rows are not imported if their global keys duplicate those of existing data rows. |
| media_type | No | "CONVERSATIONAL" (optional media type to provide better validation and error messaging) |
| metadata_fields | No | See metadata |

Import format

{
  "row_data": {
    "type": "application/vnd.labelbox.conversational.model-chat-evaluation",
    "version": 2,
    "actors": {
        "cm1qu8krf00063b72cutnbn5l": {
        "role": "human",
        "metadata": { "name": "User" }
        },
        "cm1vjleif00023b6y4fw4ew94": {
        "role": "model",
        "metadata": {
            "modelConfigName": "Gem Pro-Copy",
            "modelConfigId": "09cfe5cb-4526-424e-a3b7-3f7f16194f88"
        }
        },
        "cm1vjleif00033b6yifzroser": {
        "role": "model",
        "metadata": {
            "modelConfigName": "gpt 4-Copy",
            "modelConfigId": "a841b425-315a-488e-a510-df0b7fa76278"
        }
        }
    },
    "messages": {
        "cm1qu8krf00073b72fyar00vh": {
        "actorId": "cm1qu8krf00063b72cutnbn5l",
        "content": [{ "type": "text", "content": "Hello " }],
        "childMessageIds": [
            "cm1vjlitg00043b6y1tgssq1r",
            "cm1vjlitg00053b6y19ve1qra"
        ]
        },
        "cm1vjlitg00043b6y1tgssq1r": {
        "actorId": "cm1vjleif00023b6y4fw4ew94",
        "content": [
            {
            "type": "text",
            "content": "Hello! 👋 How can I assist you today? 😊 \\n"
            }
        ],
        "childMessageIds": []
        },
        "cm1vjlitg00053b6y19ve1qra": {
        "actorId": "cm1vjleif00033b6yifzroser",
        "content": [
            { "type": "text", "content": "Hi! How can I assist you today?" }
        ],
        "childMessageIds": []
        }
    },
    "rootMessageIds": ["cm1qu8krf00073b72fyar00vh"]
    },
  "global_key": "global_key"
}
[
    {
      "row_data": "https://storage.googleapis.com/labelbox-datasets/conversational-sample-data/pairwise_shopping_1.json",
      "global_key": "global_key_1"
    },
    {
        "row_data": "https://storage.googleapis.com/labelbox-datasets/conversational-sample-data/pairwise_shopping_2.json",
        "global_key": "global_key_2"
    },
    {
        "row_data": "https://storage.googleapis.com/labelbox-datasets/conversational-sample-data/pairwise_shopping_3.json",
        "global_key": "global_key_3"
    }
]

Python example

from labelbox import Client
import os

# Build the Labelbox client. The API key is read from the LOCALHOST_API_KEY
# environment variable (None if it is unset), and both endpoints point at
# services running on localhost.
# NOTE(review): these endpoint overrides presumably only apply to a local
# deployment; confirm against the SDK defaults before copying this snippet.
labelbox_api_key = os.environ.get('LOCALHOST_API_KEY')
graphql_url = "http://localhost:8080/graphql"
rest_url = "http://localhost:3000/api/api/v1"
client = Client(api_key=labelbox_api_key, endpoint=graphql_url, rest_endpoint=rest_url)

# Conversation payload to embed in the data row. Actor and message IDs are
# named once so the references between the "actors", "messages" and
# "rootMessageIds" sections are easy to follow.
human_actor_id = "cm1qu8krf00063b72cutnbn5l"
model_a_actor_id = "cm1vjleif00023b6y4fw4ew94"
model_b_actor_id = "cm1vjleif00033b6yifzroser"

prompt_message_id = "cm1qu8krf00073b72fyar00vh"
model_a_reply_id = "cm1vjlitg00043b6y1tgssq1r"
model_b_reply_id = "cm1vjlitg00053b6y19ve1qra"

# One human actor plus the two model actors whose replies are compared.
actors = {
    human_actor_id: {"role": "human", "metadata": {"name": "User"}},
    model_a_actor_id: {
        "role": "model",
        "metadata": {
            "modelConfigName": "Gem Pro-Copy",
            "modelConfigId": "09cfe5cb-4526-424e-a3b7-3f7f16194f88",
        },
    },
    model_b_actor_id: {
        "role": "model",
        "metadata": {
            "modelConfigName": "gpt 4-Copy",
            "modelConfigId": "a841b425-315a-488e-a510-df0b7fa76278",
        },
    },
}

# The human prompt lists both model replies as its children; the replies
# themselves have no children.
messages = {
    prompt_message_id: {
        "actorId": human_actor_id,
        "content": [{"type": "text", "content": "Hello "}],
        "childMessageIds": [model_a_reply_id, model_b_reply_id],
    },
    model_a_reply_id: {
        "actorId": model_a_actor_id,
        "content": [
            {"type": "text", "content": "Hello! 👋 How can I assist you today? 😊 \\n"},
        ],
        "childMessageIds": [],
    },
    model_b_reply_id: {
        "actorId": model_b_actor_id,
        "content": [
            {"type": "text", "content": "Hi! How can I assist you today?"},
        ],
        "childMessageIds": [],
    },
}

row_data = {
    "type": "application/vnd.labelbox.conversational.model-chat-evaluation",
    "version": 2,
    "actors": actors,
    "messages": messages,
    "rootMessageIds": [prompt_message_id],
}

# Create a dataset
dataset = client.create_dataset(
    name="mmc_dataset",
)

# Upload the conversation to the dataset as a single data row. Each data row
# is a dict whose required "row_data" key holds the conversation payload, so
# the payload is wrapped here rather than passed to create_data_rows() bare.
task = dataset.create_data_rows([{"row_data": row_data}])
task.wait_till_done()

# Output any errors that occurred during the import.
print("Errors:", task.errors)
print("Failed data rows:", task.failed_data_rows)
# Import data rows that reference cloud-hosted conversation JSON files by URL.
# Uses the `client` object created earlier in this example.
import uuid

# Generate dummy global keys. Global keys must be unique, so random UUIDs are
# used to keep the example re-runnable.
global_key_1 = str(uuid.uuid4())
global_key_2 = str(uuid.uuid4())
global_key_3 = str(uuid.uuid4())

# Create a dataset with a unique name.
# NOTE(review): iam_integration=None presumably skips any IAM integration
# because the sample files are publicly readable; confirm.
dataset = client.create_dataset(
    name=f"pairwise_demo_{uuid.uuid4()}",
    iam_integration=None,
)

# Upload data rows: each "row_data" value is the https URL of a conversation
# JSON file, paired with its global key.
task = dataset.create_data_rows([
    {
        "row_data": "https://storage.googleapis.com/labelbox-datasets/conversational-sample-data/pairwise_shopping_1.json",
        "global_key": global_key_1,
    },
    {
        "row_data": "https://storage.googleapis.com/labelbox-datasets/conversational-sample-data/pairwise_shopping_2.json",
        "global_key": global_key_2,
    },
    {
        "row_data": "https://storage.googleapis.com/labelbox-datasets/conversational-sample-data/pairwise_shopping_3.json",
        "global_key": global_key_3,
    },
])
task.wait_till_done()

# Output any errors that occurred during the import.
print("Errors:", task.errors)
print("Failed data rows:", task.failed_data_rows)

Conversation v2 JSON

| Parameter | Required | Description |
| --- | --- | --- |
| type | Yes | Populate with application/vnd.labelbox.conversational.model-chat-evaluation |
| version | Yes | Populate with 2 |
| actors | Yes | An object containing the actors of the chat conversation. |
| messages | Yes | An object containing the messages from each actor. |
| rootMessageIds | Yes | An array of message IDs. Include the ID of the first message from a human actor. |

Actor object

Each actor object is keyed by a unique, user-provided ID.

Each actor object has a role key and a metadata key. The metadata contains the specifics of the actor and will vary depending on the actor's role.

| Parameter | Required | Description |
| --- | --- | --- |
| role | Yes | The role of the actor. Either human or model. |
| name | No | The name of the actor. This applies only to actors with the human role, for which it is required. Placed inside the actor's metadata key. |
| modelConfigName | Yes | The model config name of the actor. This is required for actors with the model role. Placed inside the actor's metadata key. |

Message object

Each message object is keyed by a unique, user-provided ID.

| Parameter | Required | Description |
| --- | --- | --- |
| actorId | Yes | The ID of the actor who produced the message. |
| content | Yes | An array of content for the message. See message content. |
| childMessageIds | No | An array of message IDs that are children of the message object. Typically this is the next series of messages. If you are comparing more than one model response, multiple message IDs can be included. |

Message content

| Parameter | Required | Description |
| --- | --- | --- |
| type | Yes | The type of message. This will be fileData for attachments, text for raw text, and dataRowAttachment for attachments on data rows. |
| content | No | The raw text content of your message. This field supports markdown. This field is used for text type messages. |
| fileUri | No | HTTPS path to a public cloud-hosted attachment file. This field is used for fileData type messages. If you want to use IAM delegated access to store conversation files, you should first add them as data row attachments. See the attachments guide on how to add an attachment to a data row. After you add your attachments to your data row, you can use the type and attachmentName keys to include your attachment inside your conversational data. |
| attachmentName | No | The name of the attachment on the data row. |
| mimeType | No | The mimeType of your fileUri data. The following mime types are supported. |

- video/mp4
- image/png
- application/pdf

See multimodal chat evaluation guide for more information.

Sample conversation v2 JSON

📘

Information

You can't upload the following file from the web interface directly. You must use an import file as described in Import format.

{
    "type": "application/vnd.labelbox.conversational.model-chat-evaluation",
    "version": 2,
    "actors": {
        "actor_1": {
            "role": "human",
            "metadata": {
                "name": "User"
            }
        },
        "actor_2": {
            "role": "model",
            "metadata": {
                "modelConfigId": "3c532058-abec-41cf-9774-23a3024d3ef2",
                "modelConfigName": "Model 1"
            }
        },
        "actor_3": {
            "role": "model",
            "metadata": {
                "modelConfigId": "c45f56f3-b9c3-4756-9ded-c9f52e4f3f95",
                "modelConfigName": "Model 2"
            }
        }
    },
    "messages": {
        "clxcboi1e00053p6n0ya733nn": {
            "actorId": "actor_1",
            "content": [
                {
                    "type": "text",
                    "content": "What's in the images?"
                },
                {
                    "type": "fileData",
                    "fileUri": "https://storage.googleapis.com/labelbox-developer-testing-assets/image/giraffe-combunetes.png",
                    "mimeType": "image/png"
                },
                {
                    "type": "fileData",
                    "fileUri": "https://storage.googleapis.com/labelbox-developer-testing-assets/image/label-blocks.png",
                    "mimeType": "image/png"
                },
                {
                    "type": "fileData",
                    "fileUri": "https://storage.googleapis.com/labelbox-developer-testing-assets/image/tim-test.png",
                    "mimeType": "image/png"
                }
            ],
            "childMessageIds": [
                "clxcboue900083p6no6emql83",
                "clxcboue900093p6nrepe8jjd"
            ]
        },
        "clxcboue900083p6no6emql83": {
            "actorId": "actor_2",
            "content": [
                {
                    "type": "text",
                    "content": "The images show:\n 1. **A giraffe in an office:**  The giraffe is standing in front of a desk with a computer showing code on the screen. There's also a whiteboard with diagrams about Kubernetes."
                }
            ],
            "childMessageIds": [
                "clxmrt0hh00023p6qykkdaqtk"
            ]
        },
        "clxcboue900093p6nrepe8jjd": {
            "actorId": "actor_3",
            "content": [
                {
                    "type": "text",
                    "content": "1. The first image shows a giraffe standing in an office environment, interacting with a computer. The giraffe is looking at a large monitor displaying code, and there is a whiteboard in the background with various diagrams and notes.\n2."
                }
            ],
            "childMessageIds": [
                "clxmrt0hh00023p6qykkdaqtk"
            ]
        },
        "clxmrt0hh00023p6qykkdaqtk": {
            "actorId": "actor_1",
            "content": [
                {
                    "type": "text",
                    "content": "What's in this PDF file?"
                },
                {
                    "type": "fileData",
                    "fileUri": "https://storage.googleapis.com/labelbox-developer-testing-assets/pdf/data_files/south-platte-tree-map.pdf",
                    "mimeType": "application/pdf"
                }
            ],
            "childMessageIds": [
                "clxmrtgxg00033p6qqzl2596o",
                "clxmrtgxg00043p6qiehsvww4"
            ]
        },
        "clxmrtgxg00033p6qqzl2596o": {
            "actorId": "actor_2",
            "content": [
                {
                    "type": "text",
                    "content": "This PDF file contains directions and maps for Christmas tree cutting areas managed by the South Platte Ranger District.  Here's a breakdown:\n**Page 1: Directions**\n* Provides written directions to three Christmas tree cutting areas:\n    **Buffalo Creek:** Located southeast of Pine Junction.\n    **Camp Fickes:** Located west of Buffalo Creek.\n    **Sugar Creek:** Located along County Road\n"
                }
            ],
            "childMessageIds": [
                "clxmru9j600053p6q0qh89zm4"
            ]
        },
        "clxmrtgxg00043p6qiehsvww4": {
            "actorId": "actor_3",
            "content": [
                {
                    "type": "text",
                    "content": "Via fugit referre [duasque longumque](http://luminavale.com/) fateri sumite\ncalidumque arma spatiis fuerit genialiter errore iacent; cuncta hausit memori.\nAestus a omnia nomenque inlimis captantur ipsumque fuga. Aeneadae dona tenero\nclipei tamen, sed de amor flagellari quas; corpore, grande.\n[Pectore inclinatcadunt](http://tardoset.com/uni-et.html), Hectoreis defensatque virga altera\nsecum caliturasque militia pennas."
                }
            ],
            "childMessageIds": [
                "clxmru9j600053p6q0qh89zm4"
            ]
        },
        "clxmru9j600053p6q0qh89zm4": {
            "actorId": "actor_1",
            "content": [
                {
                    "type": "text",
                    "content": "What have astronauts brought back from the moon?"
                }
            ],
            "childMessageIds": [
                "clxmrupyh00063p6q4wxj97sz",
                "clxmrupyh00073p6qeszn06l7"
            ]
        },
        "clxmrupyh00063p6q4wxj97sz": {
            "actorId": "actor_2",
            "content": [
                {
                    "type": "text",
                    "content": "## Petebat semine\nDiurnis parsque, tanti nuper novi, extremae caede *Psophidaque spiro* dum visa.\nUsu dicebat obstet meritos."
                }
            ],
            "childMessageIds": []
        },
        "clxmrupyh00073p6qeszn06l7": {
            "actorId": "actor_3",
            "content": [
                {
                    "type": "text",
                    "content": "## Ossa custos captabat insanis humus Cipe temptatum\nLorem markdownum adflatuque est Tydides medios. Notatas te Pandrose **solent**\npartes saucius animal certamen, plures opem corpora. Est magni duce, illiarcus: Iuno atque aderat amplexo genusque."
                }
            ],
            "childMessageIds": []
        }
    },
    "rootMessageIds": [
        "clxcboi1e00053p6n0ya733nn"
    ]
}

📘

LaTeX support

To add LaTeX formatting, wrap your math expressions using backticks and dollar signs. The editor supports both inline and block LaTeX formatting. For example, to add LaTeX formatting for x=2, put ```$$x = 2$$```.