How-to guide on exporting Labelbox data to a CSV or Pandas-friendly format.
Creating a CSV file for your Labelbox data can be difficult, especially if you want to include information on the annotations associated with your label. This guide will provide examples and show the process to get your Labelbox export to a CSV or Pandas-friendly format.
Information
This guide will assume you have a basic understanding of Python data structures and interacting with Labelbox exports.
Before you start
The below imports are needed to use the code examples in this section.
import labelbox as lb
import labelbox.types as lb_types
import uuid
from pprint import pprint
import csv
import pandas as pd
Replace the value of API_KEY with a valid API key to connect to the Labelbox client.
# Labelbox API key handed to the SDK client; replace None with a valid key
API_KEY = None
client = lb.Client(api_key=API_KEY)
Create or select example project
The below steps will set up a project that can be used for this demo. Please feel free to delete the code block below and uncomment the code block that fetches your own project directly. For more information on this setup, visit our quickstart guide.
Create Project
# Register a single sample image as a data row, identified by a fresh global key
global_key = str(uuid.uuid4())
test_img_url = dict(
    row_data=
    "https://storage.googleapis.com/labelbox-datasets/image_sample_data/2560px-Kitano_Street_Kobe01s5s4110.jpeg",
    global_key=global_key,
)
dataset = client.create_dataset(name="image-demo-dataset")
task = dataset.create_data_rows([test_img_url])
task.wait_till_done()
# Report the outcome of the data row upload
print("Errors:", task.errors)
print("Failed data rows:", task.failed_data_rows)
# Describe the demo ontology: four global classifications and two bounding box tools
ontology_builder = lb.OntologyBuilder(
    classifications=[
        # Single-choice question
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="radio_question",
            options=[
                lb.Option(value="first_radio_answer"),
                lb.Option(value="second_radio_answer"),
            ],
        ),
        # Multiple-choice question
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="checklist_question",
            options=[
                lb.Option(value="first_checklist_answer"),
                lb.Option(value="second_checklist_answer"),
            ],
        ),
        # Free-form text question
        lb.Classification(
            class_type=lb.Classification.Type.TEXT,
            name="free_text",
        ),
        # Single-choice question whose answer carries a nested sub-question
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="nested_radio_question",
            options=[
                lb.Option(
                    value="first_radio_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.RADIO,
                            name="sub_radio_question",
                            options=[
                                lb.Option(value="first_sub_radio_answer"),
                            ],
                        ),
                    ],
                ),
            ],
        ),
    ],
    tools=[
        # Plain bounding box
        lb.Tool(tool=lb.Tool.Type.BBOX, name="bounding_box"),
        # Bounding box that carries its own radio sub-question
        lb.Tool(
            tool=lb.Tool.Type.BBOX,
            name="bbox_with_radio_subclass",
            classifications=[
                lb.Classification(
                    class_type=lb.Classification.Type.RADIO,
                    name="sub_radio_question",
                    options=[
                        lb.Option(value="tool_first_sub_radio_answer"),
                    ],
                ),
            ],
        ),
    ],
)
# Register the ontology with Labelbox for image projects
ontology = client.create_ontology(
    name="Image CSV Demo Ontology",
    normalized=ontology_builder.asdict(),
    media_type=lb.MediaType.Image,
)
# Create the image project and attach the ontology defined above
project = client.create_project(
    name="Image Annotation Import Demo",
    media_type=lb.MediaType.Image,
)
project.connect_ontology(ontology)
# Queue our data row in the project; the row is referenced by its global key
batch = project.create_batch(
    name="image-demo-batch",
    global_keys=[global_key],
    priority=1,
)
print(f"Batch: {batch}")
# Build one annotation per ontology feature, then import them as a single label

# Global classifications
radio_annotation = lb_types.ClassificationAnnotation(
    name="radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(name="second_radio_answer")),
)
checklist_annotation = lb_types.ClassificationAnnotation(
    name="checklist_question",
    value=lb_types.Checklist(answer=[
        lb_types.ClassificationAnswer(name="first_checklist_answer"),
        lb_types.ClassificationAnswer(name="second_checklist_answer"),
    ]),
)
text_annotation = lb_types.ClassificationAnnotation(
    name="free_text",
    value=lb_types.Text(answer="sample text"),
)
# Radio answer that itself carries an answered sub-question
nested_radio_annotation = lb_types.ClassificationAnnotation(
    name="nested_radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(
            name="first_radio_answer",
            classifications=[
                lb_types.ClassificationAnnotation(
                    name="sub_radio_question",
                    value=lb_types.Radio(
                        answer=lb_types.ClassificationAnswer(
                            name="first_sub_radio_answer")),
                ),
            ],
        )),
)

# Bounding boxes
bbox_annotation = lb_types.ObjectAnnotation(
    name="bounding_box",
    value=lb_types.Rectangle(
        start=lb_types.Point(x=1690, y=977),
        end=lb_types.Point(x=1915, y=1307),
    ),
)
bbox_with_radio_subclass_annotation = lb_types.ObjectAnnotation(
    name="bbox_with_radio_subclass",
    value=lb_types.Rectangle(
        start=lb_types.Point(x=541, y=933),  # x = left, y = top
        end=lb_types.Point(x=871, y=1124),  # x = left + width, y = top + height
    ),
    classifications=[
        lb_types.ClassificationAnnotation(
            name="sub_radio_question",
            value=lb_types.Radio(
                answer=lb_types.ClassificationAnswer(
                    name="tool_first_sub_radio_answer")),
        ),
    ],
)

annotations = [
    radio_annotation,
    nested_radio_annotation,
    checklist_annotation,
    text_annotation,
    bbox_annotation,
    bbox_with_radio_subclass_annotation,
]
# One label holding every annotation, attached to our data row by global key
label = [
    lb_types.Label(data={"global_key": global_key}, annotations=annotations),
]

# Import the label into the project and wait for the job to finish
upload_job = lb.LabelImport.create_from_objects(
    client=client,
    project_id=project.uid,
    name="label_import_job" + str(uuid.uuid4()),
    labels=label,
)
upload_job.wait_until_done()
print("Errors:", upload_job.errors)
print("Status of uploads: ", upload_job.statuses)
Select project
# PROJECT_ID = None
# project = client.get_project(PROJECT_ID)
CSV format overview
To convert our Labelbox JSON data to a format more CSV-friendly, we must first define the needed structure of our JSON. A common format that is versatile for both the built-in Python CSV writer and Pandas is as follows:
[
{"<column_1>": "<answer_1>", "<column_2>": "<answer_2>", ...},
{"<column_1>": "<answer_1>", "<column_2>": "<answer_2>", ...},
...
]
Essentially, we need to turn our JSON data into a list of Python dictionaries, with each Python dictionary representing one row, each key representing a column, and each value being an individual cell of our CSV table. Once we have our data in this format, creating Pandas DataFrames or writing our CSV file is trivial. The tricky part is getting the Labelbox export JSON into this format.
Labelbox JSON format
Labelbox JSON format is centralized at the individual data row of your export. This format allows expandability when things evolve and provides a centralized view of fields such as metadata or data row details. The main labels are located inside the project key and can be nested, making it difficult to parse. For complete samples of our project export format, visit our export overview page.
To get Labelbox export JSON format to our CSV format, we established we must do the following:
- Establish our base data row columns (project_id, data_row_id, global_key etc)
- Create our columns for label fields (label detail and annotations we care about)
- Define our functions and strategy used to parse through our data
- Setting up our main data row handler function
- Export our data
- Convert to our desired format
Step 1: Establish our base columns
We first establish our base columns that represent individual data row details. Typically, the information for these columns can be found within the first one or two levels of each data row in a Labelbox export.
Please modify the below columns if you want to include more. You must update the code later in this guide to pick up any additional columns.
# Data-row-level columns, listed in the order they will appear in the CSV
data_row_base_columns = ["Data Row ID", "Global Key", "External ID", "Project ID"]
Step 2: Create our columns for label fields
In this step, we define the label details base columns we want to include in our CSV. In this case, we will use the following:
# Label-level detail columns, listed in the order they will appear in the CSV
label_base_columns = [
    "Label ID",
    "Created By",
    "Skipped",
]
We then must establish the annotations we want to include in our columns. The order of our list matters since that is the order in which our columns will be presented. You can approach getting the annotations in a list in a number of ways, including hard defining the columns. We will be mapping between feature_schema_ids and our column name. The reason for introducing this mapping is the annotation name can be the same in certain situations, but feature_schema_ids are completely unique. This also allows you to change the column names to something other than what is included in the ontology. In the code below, I will be recursively going through the ontology we created to get our feature_schema_ids and column names based on the names of the features. In the next step of this guide, we will provide more information on recursion in the context of parsing through JSON or Python dictionaries.
def get_classification_features(classifications: list, class_list=None) -> list:
    """Recursively collect the classification features of a normalized ontology.

    Every entry that has a "name" key contributes one
    ``{"feature_schema_id": ..., "column_name": ...}`` dict, taken from its
    "featureSchemaId" and "instructions" values. Every entry that has an
    "options" key is searched recursively, so nested features are listed
    right after their parent, in depth-first order.

    Args:
        classifications: List of dicts from a normalized ontology.
        class_list: Optional list to append the results to. When omitted, a
            new list is created for this call.

    Returns:
        The list holding the collected features (``class_list`` itself when
        one was passed in).
    """
    # Create the accumulator per call: a mutable default argument would be
    # shared between calls and keep growing every time the function is re-run.
    if class_list is None:
        class_list = []
    for classification in classifications:
        # Entries without a "name" key add no column of their own (presumably
        # these are answer options; confirm against the ontology schema), but
        # their nested "options" are still searched below.
        if "name" in classification:
            class_list.append({
                "feature_schema_id": classification["featureSchemaId"],
                "column_name": classification["instructions"],
            })
        if "options" in classification:
            get_classification_features(classification["options"], class_list)
    return class_list
def get_tool_features(tools: list) -> list:
    """Collect the tool features of a normalized ontology, including sub-classifications.

    Each tool contributes one ``{"feature_schema_id": ..., "column_name": ...}``
    dict built from its "featureSchemaId" and "name" values. When a tool has a
    "classifications" key, the features found there by
    ``get_classification_features`` are appended directly after the tool.

    Args:
        tools: List of tool dicts from a normalized ontology.

    Returns:
        A new list of feature dicts in ontology order.
    """
    tool_list = []
    for tool in tools:
        tool_list.append({
            "feature_schema_id": tool["featureSchemaId"],
            "column_name": tool["name"],
        })
        if "classifications" in tool:
            # The helper appends to the list it is given and returns it
            tool_list = get_classification_features(tool["classifications"],
                                                    tool_list)
    return tool_list
# Fetch the project's ontology in its normalized (plain Python dictionary) form.
# NOTE: this rebinds `ontology`, which until now held the object returned by client.create_ontology.
ontology = project.ontology().normalized
# Flatten the ontology into ordered {"feature_schema_id", "column_name"} entries:
# one list for the global classifications, one for the tools and their sub-classifications.
class_annotation_columns = get_classification_features(
ontology["classifications"])
tool_annotation_columns = get_tool_features(ontology["tools"])
Step 3: Define our functions and strategy used to parse through our data
Now that our columns are defined, we must develop a strategy for navigating our export data. Review this sample export to follow along. While creating our columns, it is always best to first check if a key exists in your data row before populating a column. This is especially important for optional fields. In this demo, we will populate the value None for anything absent, resulting in a blank cell in our CSV.
Data row detail base columns
The data row details can be accessed within a depth of one or two keys. Below is a function we will use to access the columns we defined. The parameters are the data row, the dictionary row used to make our list, and our base columns list.
def get_base_data_row_columns(data_row: dict, csv_row: dict[str, object],
                              base_columns: list[str]) -> dict[str, object]:
    """Fill ``csv_row`` with the data-row-level columns listed in ``base_columns``.

    Args:
        data_row: One exported data row; its details are read from the
            "data_row" key.
        csv_row: Dictionary representing the CSV row being built. It is
            modified in place.
        base_columns: Names of the columns to populate. Names other than
            "Data Row ID", "Global Key", "External ID" and "Project ID" are
            ignored.

    Returns:
        The same ``csv_row`` dictionary, for convenient chaining.
    """
    for base_column in base_columns:
        if base_column == "Data Row ID":
            csv_row[base_column] = data_row["data_row"]["id"]
        elif base_column == "Global Key":
            # Missing global key -> None, which becomes a blank cell in the CSV
            csv_row[base_column] = data_row["data_row"].get("global_key")
        elif base_column == "External ID":
            # Missing external id -> None, which becomes a blank cell in the CSV
            csv_row[base_column] = data_row["data_row"].get("external_id")
        elif base_column == "Project ID":
            # Taken from the module-level `project`, not from the export itself
            csv_row[base_column] = project.uid
    return csv_row
Label detail base columns
The label details are similar to data row details but exist at our export's label level. Later in the guide we will demonstrate how to get our exported data row at this level. The function below shows the process of obtaining the details we defined above. The parameters are the label, the dictionary row we will modify, and the label detail column list we created.
def get_base_label_columns(label: dict, csv_row: dict[str, object],
                           label_base_columns: list[str]) -> dict[str, object]:
    """Fill ``csv_row`` with the label-level columns listed in ``label_base_columns``.

    Args:
        label: One label dictionary taken from an exported data row.
        csv_row: Dictionary representing the CSV row being built. It is
            modified in place.
        label_base_columns: Names of the columns to populate. Names other
            than "Label ID", "Created By" and "Skipped" are ignored.

    Returns:
        The same ``csv_row`` dictionary, for convenient chaining.
    """
    for label_base_column in label_base_columns:
        if label_base_column == "Label ID":
            csv_row[label_base_column] = label["id"]
        elif label_base_column == "Created By":
            # "label_details" can be omitted from the export -> blank cell
            csv_row[label_base_column] = (label["label_details"]["created_by"]
                                          if "label_details" in label else None)
        elif label_base_column == "Skipped":
            # "performance_details" can be omitted from the export -> blank cell
            csv_row[label_base_column] = (
                label["performance_details"]["skipped"]
                if "performance_details" in label else None)
    return csv_row
Label annotation columns
The label annotations are the final columns we will need to obtain. Obtaining these fields is more challenging than our approach for our detail columns. Suppose we attempt to obtain the fields with conditional statements and hard-defined paths. In that case, we will run into issues as each label can have annotations in different orders, at different depths, or not present. This will quickly create a mess, especially when we want our methods to work for multiple ontologies. The best and cleanest way of obtaining these annotations inside our export data is through a recursive function.
Recursion
A recursive function can be defined as a routine that calls itself directly or indirectly. They solve problems by solving smaller instances of the same problem. This technique is commonly used in programming to solve problems that can be broken down into simpler, similar subproblems. Our sub-problem, in this case, is obtaining each individual annotation. A recursive function is divided into two components:
- Base case: This is our termination condition that prevents the function from calling itself indefinitely.
- Recursive case: The function calls itself with the modified arguments in the recursive case. The recursive case should move closer to the base case with each iteration.
For our example, our base case will be either the annotation exists on the label (return the value/answer), or it does not (return None). Our recursive case would be finding more classifications to parse.
In the code block below, I will highlight a few important details about our function. Essentially, we will be navigating through our JSON file by moving one classification key at a time until we find our annotation or, if everything has been searched, returning None, which will populate a blank cell on our CSV table.
Tools
Tools are not nested, but they can have nested classifications, so we will use our get_feature_answers function below to find the nested classifications. Since tools are at the base level of a label and each tool has a different value key name, we will only be searching for bounding boxes in this tutorial. If you want to include other tools, reference our export guide for your data type and find the appropriate key to add.
def get_feature_answers(feature: str,
annotations: list[dict[str:str]]) -> None | str:
"""Returns answer of feature provided by navigating through a label's annotation list. Will return None if answer is not found.
Args:
feature (str): feature we are searching
classifications (list[dict[str:str]]): annotation list that we will be searching for our feature with.
Returns:
None | str: The answer/value of the feature returns None if nothing is found
"""
for annotation in annotations:
print(annotation)
if (annotation["feature_schema_id"] == feature["feature_schema_id"]
): # Base conditions (found feature)
if "text_answer" in annotation:
return annotation["text_answer"]["content"]
if "radio_answer" in annotation:
return annotation["radio_answer"]["value"]
if "checklist_answers" in annotation:
# Since classifications can have more then one answer. This is set up to combine all classifications separated by a comma. Feel free to modify.
return ", ".join([
check_list_ans["value"]
for check_list_ans in annotation["checklist_answers"]
])
if "bounding_box" in annotation:
return annotation["bounding_box"]
# Add more tools here with similar pattern as above
# Recursion cases (found more classifications to search through)
if "radio_answer" in annotation:
if len(annotation["radio_answer"]["classifications"]) > 0:
value = get_feature_answers(
feature, annotation["radio_answer"]["classifications"]
) # Call function again return value if answer found
if value:
return value
if "checklist_answers" in annotation:
for checklist_ans in annotation["checklist_answers"]:
if len(checklist_ans["classifications"]) > 0:
value = get_feature_answers(
feature, checklist_ans["classifications"])
if value:
return value
if ("classifications"
in annotation): # case for if tool has classifications
if len(annotation["classifications"]) > 0:
value = get_feature_answers(feature,
annotation["classifications"])
if value:
return value
return None # Base case if searched through classifications and nothing was found (end of JSON). This can be omitted but included to visualize
Step 4: Setting up our main data row handler function
Before exporting, we need to set up our main data row handler. This function will be fed straight into our export. This function will put everything together and connect all the pieces. We will also be defining our global dictionary list that will be used to create our CSVs. The output parameter represents each data row.
# Collects one flattened dictionary ("row") per exported label
GLOBAL_CSV_LIST = []


def main(output: lb.BufferedJsonConverterOutput):
    """Flatten every label of one exported data row and append it to GLOBAL_CSV_LIST.

    Each label of the current project becomes one row containing the data row
    base columns, the label base columns, then one column per classification
    feature and one per tool feature.

    NOTE: columns are keyed by column name, so if two features share a column
    name the value written last replaces the earlier one.

    Args:
        output: One item of the export stream; its ``json`` attribute holds
            the exported data row.
    """
    exported_row = output.json
    # (feature columns, key under label["annotations"] that holds their answers)
    annotation_sources = (
        (class_annotation_columns, "classifications"),
        (tool_annotation_columns, "objects"),
    )
    for label in exported_row["projects"][project.uid]["labels"]:
        # Data row details first, then label details
        csv_row = get_base_data_row_columns(exported_row, {},
                                            data_row_base_columns)
        csv_row = get_base_label_columns(label, csv_row, label_base_columns)
        # Classification features, followed by tool features
        for features, section in annotation_sources:
            for feature in features:
                csv_row[feature["column_name"]] = get_feature_answers(
                    feature, label["annotations"][section])
        GLOBAL_CSV_LIST.append(csv_row)
Step 5: Export our data
We are ready to export now that we have defined functions and strategies. Below, we export directly from our project and feed in the main function we created above.
# Ask the export to include the optional sections our label columns read from
params = {
    "performance_details": True,
    "label_details": True,
}
export_task = project.export(params=params)
export_task.wait_till_done()

# Print every error reported by the export task
if export_task.has_errors():
    export_task.get_buffered_stream(
        stream_type=lb.StreamType.ERRORS,
    ).start(stream_handler=lambda error: print(error))

# Stream each exported data row straight into our data row handler
if export_task.has_result():
    export_json = export_task.get_buffered_stream(
        stream_type=lb.StreamType.RESULT,
    ).start(stream_handler=main)
If everything went through correctly, you should see your GLOBAL_CSV_LIST printed out below with all your "rows" filled out.
# Pretty-print the flattened rows that the data row handler appended during the export
pprint(GLOBAL_CSV_LIST)
Step 6: Convert to our desired format
The hard part is now completed!🚀 Now that you have your export in a flattened format, you can easily convert to a CSV or a Pandas DataFrame!
Option A: CSV writer
# Column order of the CSV: data row details, label details, classifications, tools
fieldnames = (data_row_base_columns + label_base_columns +
              [name["column_name"] for name in class_annotation_columns] +
              [name["column_name"] for name in tool_annotation_columns])

# newline="" lets the csv module control line endings itself; the explicit
# encoding keeps the output independent of the platform's default encoding,
# which may not be able to represent every character found in text answers.
with open("file.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # None values are written as blank cells
    writer.writerows(GLOBAL_CSV_LIST)
Option B: Pandas DataFrame
# Same column order as the CSV: data row details, label details, classifications, tools
columns = [
    *data_row_base_columns,
    *label_base_columns,
    *(name["column_name"] for name in class_annotation_columns),
    *(name["column_name"] for name in tool_annotation_columns),
]
pd.DataFrame(GLOBAL_CSV_LIST, columns=columns)