Initialize Hugging Face dataset, models, and pipeline
datasets = load_dataset("Babelscape/wikineural")
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")
## Use a GPU if one is available
## ("!" runs a shell command; this check works in IPython/Colab notebooks)
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
  nlp = pipeline("ner", model=model, tokenizer=tokenizer)
  embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
else:
  print(gpu_info)
  nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=0)
  embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens', device=0)
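Optionally, run the pipeline on a short example sentence to inspect the prediction structure the rest of this guide relies on: a list of dicts with entity, score, start, and end keys. This is a quick smoke test, not part of the import flow, and the sentence below is only an illustration.
## Quick smoke test: inspect the token-classification output consumed later
sample_predictions = nlp("Barack Obama was born in Hawaii.")
for prediction in sample_predictions:
  print(prediction["entity"], prediction["start"], prediction["end"], round(float(prediction["score"]), 3))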
Set up client and configure parameters
## Enter your API key here
LB_API_KEY = "YOUR_API_KEY"
client = labelbox.Client(api_key=LB_API_KEY)
## Batch size for creating data rows and bulk-importing annotations. A batch size of 500-1,000 is recommended.
BATCH_SIZE = 500
## Maximum number of data rows to import. The full WikiNEuRal dataset has ~1.1M data rows.
MAX_DATA_ROW_LIMIT = 2000
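As a quick optional check that the API key is valid, you can fetch your organization before continuing (get_organization is a standard SDK call; the print is only a sanity check).
## Optional: confirm the client is authenticated
print(client.get_organization().name)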
Functions
def create_ner_objects(class_name, st, en):
  ## Build a Labelbox text-entity annotation from a class name and character offsets
  named_entity = TextEntity(start=st, end=en)
  named_entity_annotation = ObjectAnnotation(value=named_entity, name=class_name)
  return named_entity_annotation
def generate_predictions(datarow):
  ## Reconstruct the source sentence from the data row's external ID, run NER,
  ## and convert high-confidence predictions into Labelbox annotations
  external_id = datarow["external_id"]
  dataset_name = external_id.split("_")[0] + "_" + external_id.split("_")[1]
  datarow_index = int(external_id.split("_")[2].split(".")[0])
  uid = datarow['id']
  text_data_row = datasets[dataset_name][datarow_index]
  tokens = text_data_row["tokens"]
  tokenized_input = tokenizer(tokens, is_split_into_words=True)
  sentence = tokenizer.decode(tokenized_input["input_ids"], skip_special_tokens=True)
  annotations = []
  ## Generate predictions
  predictions = nlp(sentence)
  ## Process predictions and compute text entities
  try:
    for index, item in enumerate(predictions):
      if item['score'] > 0.99:
        entity = item['entity']
        start = item['start']
        end = item['end']
        ## Merge a B- token with the I- tokens that follow it into a single entity span
        if entity.startswith("B-"):
          class_name = entity[2:]  # PER, ORG, LOC, or MISC
          for next_item in predictions[index+1:]:
            if next_item['entity'] == "I-" + class_name:
              end = next_item['end']
            else:
              break
          annotations.append(create_ner_objects(class_name, start, end-1))
  except Exception as e:
    print(e)
  text_data = TextData(uid=uid)
  return text_data, annotations
def create_data_rows_payload(payload):
  ## Build a Labelbox data row payload (text file URL + metadata) for one sentence
  data_row_content = None
  label_content = None
  try:
    h, text_data_row, lang = payload
    file_name = lang + "_" + str(h) + '.txt'
    tokens = text_data_row["tokens"]
    tokenized_input = tokenizer(tokens, is_split_into_words=True)
    sentence = tokenizer.decode(tokenized_input["input_ids"], skip_special_tokens=True)
    embeddings = embedding_model.encode(sentence)
    embeddings_metadata = DataRowMetadataField(
        schema_id=embedding_field.uid,
        ## Labelbox currently supports custom embeddings with a maximum length of 128
        value=embeddings[:128].tolist(),
    )
    language_metadata = DataRowMetadataField(
        schema_id=language_field.uid,
        value=lang,
    )
    metadata_payload = [language_metadata, embeddings_metadata]
    data_row_content = {
        DataRow.row_data: "gs://labelbox-datasets/wiki_neural_text_ner/" + file_name,
        DataRow.external_id: file_name,
        DataRow.metadata_fields: metadata_payload,
    }
  except Exception as e:
    print(e)
  return data_row_content, label_content
Create a dataset
dataset = client.create_dataset(name="WikiNEuRal Text NER")
Set up ontology
This example requires a custom metadata string field named "language". For help, see Custom fields.
metadata_ontology = client.get_data_row_metadata_ontology()
## Fetch the custom metadata schema named "language" (string kind); to create it, see /docs/datarow-metadata#custom-fields
language_field = metadata_ontology.custom_by_name["language"]
## Fetch the reserved "embedding" metadata schema
embedding_field = metadata_ontology.reserved_by_name["embedding"]
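If the custom "language" field does not exist in your workspace, the custom_by_name lookup above raises a KeyError. An optional pre-check you can run beforehand, sketched using only the calls shown above, makes the failure explicit:
## Optional pre-check: fail with a clear message if the "language" field is missing
if "language" not in metadata_ontology.custom_by_name:
  raise ValueError('Create a custom string metadata field named "language" first (see Custom fields)')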
ontology = OntologyBuilder()
PER = Tool(tool = Tool.Type.NER, name = "PER")
ontology.add_tool(PER)
ORG = Tool(tool = Tool.Type.NER, name = "ORG")
ontology.add_tool(ORG)
LOC = Tool(tool = Tool.Type.NER, name = "LOC")
ontology.add_tool(LOC)
MISC = Tool(tool = Tool.Type.NER, name = "MISC")
ontology.add_tool(MISC)
ontology = client.create_ontology("WikiNEuRal Text NER", ontology.asdict())
Set up a labeling project
project = client.create_project(name = "WikiNEuRal Text NER", media_type=MediaType.Text)
project.connect_ontology(ontology)
ontology_from_project = OntologyBuilder.from_project(project)
project.update(queue_mode=project.QueueMode.Batch)
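As an optional check, print the tool names from the builder created above to confirm the four NER tools are attached to the project.
## Optional: verify the connected ontology contains the expected NER tools
print([tool.name for tool in ontology_from_project.tools])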
Process and create data row payload in batches
tuples = []
for item in datasets:
  ## To import only a single split, filter here, e.g.: if item == "train_en":
  for h, text_data_row in enumerate(datasets[item]):
    tuples.append((h, text_data_row, item))
## Randomly sample down to the configured limit
if MAX_DATA_ROW_LIMIT is not None:
  tuples = random.sample(tuples, MAX_DATA_ROW_LIMIT)
## Split the work into chunks of BATCH_SIZE
chunked_tuples = []
for i in range(0, len(tuples), BATCH_SIZE):
  chunked_tuples.append(tuples[i:i + BATCH_SIZE])
Main loop to import data
for current_index, chunk in enumerate(chunked_tuples):
  start_time = time.time()
  print("Executing iteration {} of {}".format(current_index + 1, len(chunked_tuples)))
  ## Generate data row payload
  data_rows = []
  for item in tqdm.tqdm(chunk):
    datarow,label = create_data_rows_payload(item)
    data_rows.append(datarow)
  ## Create data rows in Labelbox
  task = dataset.create_data_rows(data_rows)
  task.wait_till_done()
  print(task)
  ## Submit a batch of the recently created data rows
  batch_datarows = []
  for item in task.result:
    batch_datarows.append(item['id'])
  batch = project.create_batch(
    str(current_index) + "_" + binascii.b2a_hex(os.urandom(5)).decode(), # name of the batch
    batch_datarows, # list of data rows
    1 # priority between 1-5
  )
  ## Generate model predictions
  ground_truth_list = LabelList()
  results = []
  for item in tqdm.tqdm(task.result):
    result = generate_predictions(item)
    ground_truth_list.append(Label(
          data=result[0],
          annotations = result[1]
      ))
  ## Convert model predictions to NDJSON format
  ground_truth_list.assign_feature_schema_ids(OntologyBuilder.from_project(project))
  ground_truth_ndjson = list(NDJsonConverter.serialize(ground_truth_list))
  ## Upload model predictions as ground truth
  upload_task = LabelImport.create_from_objects(client, project.uid, f"upload-job-{uuid4()}", ground_truth_ndjson)
  upload_task.wait_until_done()
  print(upload_task.errors)
  print(str((time.time() - start_time))+" seconds")