Initialize the Hugging Face dataset, models, and pipeline

## Load the WikiNEuRal dataset, then the tokenizer and token-classification model.
## Both the tokenizer and the model come from the same pretrained checkpoint.
NER_CHECKPOINT = "dslim/bert-large-NER"

datasets = load_dataset("Babelscape/wikineural")
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

## Use GPU if it exists
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
  nlp = pipeline("ner", model=model, tokenizer=tokenizer)
  embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')

else:
  print(gpu_info)
  nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=0)
  embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens', device=0)

Set up the client and configure parameters

## Replace the placeholder with your own API key
LB_API_KEY = "YOUR_API_KEY"

## Batch size used both for creating data rows and for the annotation bulk import.
## The recommended size is 500-1000.
BATCH_SIZE = 500

## Maximum number of data rows to import (the WikiNeural dataset has ~1.1M data rows)
MAX_DATA_ROW_LIMIT = 2000

client = labelbox.Client(api_key=LB_API_KEY)

Functions

def create_ner_objects(class_name, st, en):
  """Build a named text-entity annotation for one span.

  Args:
    class_name: Name given to the resulting ``ObjectAnnotation``.
    st: Value passed as ``start`` of the ``TextEntity``.
    en: Value passed as ``end`` of the ``TextEntity``.

  Returns:
    An ``ObjectAnnotation`` whose value is the newly created ``TextEntity``.
  """
  return ObjectAnnotation(value=TextEntity(start=st, end=en), name=class_name)

def generate_predictions(datarow, min_score=0.99):
  """Run the NER pipeline on one data row and convert its output to annotations.

  The row's tokens are looked up in the module-level ``datasets`` using the
  split name and row number encoded in the external id
  (``<split>_<lang>_<row number>.txt``). They are rebuilt into a sentence with
  ``tokenizer`` and passed to ``nlp``.

  A prediction tagged ``B-<class>`` (PER, ORG, LOC or MISC) whose score is
  above ``min_score`` opens an entity. The entity is extended over the
  directly following ``I-<class>`` predictions of the same class, whatever
  their score. All other predictions are ignored.

  Args:
    datarow: Mapping with the keys ``"external_id"`` and ``"id"``.
    min_score: Score a ``B-`` prediction must exceed to open an entity.

  Returns:
    Tuple ``(text_data, annotations)``: a ``TextData`` built from the row id
    and the list of annotations produced by ``create_ner_objects``.

  Raises:
    KeyError: If ``datarow`` lacks ``"external_id"`` or ``"id"``.
    IndexError: If the external id has fewer than three ``_``-separated parts.
    ValueError: If the row-number part of the external id is not an integer.
  """
  entity_classes = ("PER", "ORG", "LOC", "MISC")

  ## Split the external id once instead of once per field
  external_id = datarow["external_id"]
  parts = external_id.split("_")
  dataset_name = parts[0] + "_" + parts[1]
  datarow_index = int(parts[2].split(".")[0])
  uid = datarow['id']

  text_data_row = datasets[dataset_name][datarow_index]
  tokens = text_data_row["tokens"]
  tokenized_input = tokenizer(tokens, is_split_into_words=True)
  sentence = tokenizer.decode(tokenized_input["input_ids"], skip_special_tokens=True)
  annotations = []

  ## Generate prediction
  predictions = nlp(sentence)

  ## Process predictions and compute text entities.
  ## Best effort: a malformed prediction is reported and the annotations
  ## collected so far are still returned.
  try:
    ## enumerate gives the position directly; no list search per item
    for position, item in enumerate(predictions):
      if not (item['score'] > min_score):
        continue

      entity = item['entity']
      start = item['start']
      end = item['end']

      if entity[:2] != "B-" or entity[2:] not in entity_classes:
        continue
      class_name = entity[2:]
      inside_tag = "I-" + class_name

      ## Extend the span over the uninterrupted run of matching I- tags
      for next_item in predictions[position + 1:]:
        if next_item['entity'] != inside_tag:
          break
        end = next_item['end']

      ## NOTE(review): end - 1 presumably converts an exclusive end offset
      ## to the inclusive end expected by TextEntity -- confirm.
      annotations.append(create_ner_objects(class_name, start, end - 1))

  except Exception as e:
    print(e)

  text_data = TextData(uid=uid)
  return text_data, annotations

def create_data_rows_payload(payload):
  """Build the data row dictionary for one sampled dataset row.

  Args:
    payload: Tuple ``(row number, dataset row, split name)``. The dataset row
      must provide a ``"tokens"`` entry.

  Returns:
    Tuple ``(data_row_content, label_content)``. ``data_row_content`` maps
    ``DataRow.row_data``, ``DataRow.external_id`` and
    ``DataRow.metadata_fields`` to their values. It is ``None`` when anything
    fails while building it, in which case the exception is printed.
    ``label_content`` is always ``None``.
  """
  label_content = None

  try:
    row_number, record, lang = payload
    file_name = "_".join([lang, str(row_number)]) + '.txt'

    ## Rebuild the sentence from the dataset tokens via the tokenizer
    encoded = tokenizer(record["tokens"], is_split_into_words=True)
    sentence = tokenizer.decode(encoded["input_ids"], skip_special_tokens=True)

    vector = embedding_model.encode(sentence)
    ## Labelbox currently only supports custom embedding of 128 max length
    embeddings_metadata = DataRowMetadataField(
      schema_id=embedding_field.uid,
      value=vector[:128].tolist(),
    )
    language_metadata = DataRowMetadataField(schema_id=language_field.uid, value=lang)

    data_row_content = {
      DataRow.row_data: "gs://labelbox-datasets/wiki_neural_text_ner/" + file_name,
      DataRow.external_id: file_name,
      DataRow.metadata_fields: [language_metadata, embeddings_metadata],
    }
    return data_row_content, label_content

  except Exception as e:
    print(e)
    return None, label_content

Create a dataset

## Create the Labelbox dataset that the import loop adds data rows to
dataset = client.create_dataset(name="WikiNEuRal Text NER")

Set up the ontology

📘

Create a custom metadata field before proceeding

Name = language
Kind = string

Learn more

metadata_ontology = client.get_data_row_metadata_ontology()
## The custom metadata schema "language" (string kind) has to exist already: https://docs.labelbox.com/docs/datarow-metadata#custom-fields
language_field = metadata_ontology.custom_by_name["language"]
embedding_field = metadata_ontology.reserved_by_name["embedding"]

## One NER tool per entity class
PER, ORG, LOC, MISC = [
  Tool(tool=Tool.Type.NER, name=label) for label in ("PER", "ORG", "LOC", "MISC")
]

ontology = OntologyBuilder()
for ner_tool in (PER, ORG, LOC, MISC):
  ontology.add_tool(ner_tool)

## Replace the builder with the ontology created from it
ontology = client.create_ontology("WikiNEuRal Text NER", ontology.asdict())

Set up a labeling project

## Create a text project and attach the ontology to its editor
project = client.create_project(name = "WikiNEuRal Text NER", media_type=MediaType.Text)
project.setup_editor(ontology)
## Read the ontology back from the project
ontology_from_project = OntologyBuilder.from_project(project)
## Switch the project to batch queue mode; the import loop submits data rows with project.create_batch
project.update(queue_mode=project.QueueMode.Batch)

Process and create data row payload in batches

## Build the list of (row number, dataset row, split name) tuples to import.
if MAX_DATA_ROW_LIMIT is None:
  ## No limit: take every row of every split
  tuples = [
    (h, text_data_row, item)
    for item in datasets
    for h, text_data_row in enumerate(datasets[item])
  ]
else:
  ## Sample lightweight (row number, split name) pairs first so that only the
  ## selected rows are read from the dataset.
  row_index = [(h, item) for item in datasets for h in range(len(datasets[item]))]
  ## Cap the sample size: random.sample raises ValueError when asked for more
  ## elements than the population holds.
  sampled = random.sample(row_index, min(MAX_DATA_ROW_LIMIT, len(row_index)))
  tuples = [(h, datasets[item][h], item) for h, item in sampled]

## Split into consecutive chunks of at most BATCH_SIZE tuples
chunked_tuples = [tuples[i:i + BATCH_SIZE] for i in range(0, len(tuples), BATCH_SIZE)]

Main iterator loop to import data

## The project's ontology does not change between chunks, so resolve it once
project_ontology = OntologyBuilder.from_project(project)

## enumerate supplies the chunk position directly; searching the list for the
## chunk would compare whole chunks on every iteration.
for current_index, chunk in enumerate(chunked_tuples):
  start_time = time.time()
  print("Executing {} of {} iteration".format(current_index, len(chunked_tuples)))

  ## Generate data row payload; rows whose payload could not be built come back
  ## as None and are left out.
  data_rows = []
  for item in tqdm.tqdm(chunk):
    datarow, _ = create_data_rows_payload(item)
    if datarow is not None:
      data_rows.append(datarow)

  if not data_rows:
    print("No data rows could be built for chunk {}; skipping".format(current_index))
    continue

  ## Create data rows in Labelbox
  task = dataset.create_data_rows(data_rows)
  task.wait_till_done()
  print(task)

  ## Submit a batch of the recently created data rows
  batch_datarows = [item['id'] for item in task.result]

  batch = project.create_batch(
    ## Chunk position plus a random hex suffix. bytes.hex() returns plain text,
    ## so no b'...' wrapper ends up in the batch name.
    "{}_{}".format(current_index, os.urandom(5).hex()), # name of the batch
    batch_datarows, # list of Data Rows
    1 # priority between 1-5
  )

  ## Generate model predictions
  ground_truth_list = LabelList()
  for item in tqdm.tqdm(task.result):
    text_data, annotations = generate_predictions(item)
    ground_truth_list.append(Label(data=text_data, annotations=annotations))

  ## Convert model predictions to NDJSON format
  ground_truth_list.assign_feature_schema_ids(project_ontology)
  ground_truth_ndjson = list(NDJsonConverter.serialize(ground_truth_list))

  ## Upload model predictions as ground truth
  upload_task = LabelImport.create_from_objects(client, project.uid, f"upload-job-{uuid4()}", ground_truth_ndjson)
  upload_task.wait_until_done()
  print(upload_task.errors)

  print(str((time.time() - start_time))+" seconds")