Import a labeled dataset (text)
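
The snippets in this section reference a number of names (load_dataset, AutoTokenizer, labelbox, TextEntity, LabelList, and so on) whose import cell is not shown here. Below is a sketch of an import cell that matches the names used throughout; the labelbox module paths follow the v3-era SDK this tutorial was written against and may have moved or been deprecated in newer releases (LabelList and NDJsonConverter in particular).

import os
import time
import random
import binascii
from uuid import uuid4

import tqdm

## Hugging Face and Sentence-Transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer

## Labelbox SDK (v3-era paths; adjust for your installed version)
import labelbox
from labelbox import DataRow, LabelImport, MediaType
from labelbox.schema.ontology import OntologyBuilder, Tool
from labelbox.schema.data_row_metadata import DataRowMetadataField
from labelbox.data.annotation_types import (
    Label,
    LabelList,
    TextData,
    ObjectAnnotation,
    TextEntity,
)
from labelbox.data.serialization import NDJsonConverter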

Initialize the Hugging Face dataset, models, and pipeline

datasets = load_dataset("Babelscape/wikineural")
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

## Use a GPU if one is available.
## Note: "!nvidia-smi" is notebook shell syntax (Colab/Jupyter); it returns a "failed" message when no GPU is attached.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
else:
    print(gpu_info)
    nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=0)
    embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens', device=0)

Set up the client and configure parameters

## Enter your API key here
LB_API_KEY = "YOUR_API_KEY"
client = labelbox.Client(api_key=LB_API_KEY)

## Batch size for data row creation and annotation bulk import. 500-1000 is the recommended size.
BATCH_SIZE = 500

## Max number of data rows to import. The WikiNEuRal dataset has ~1.1M data rows.
MAX_DATA_ROW_LIMIT = 2000

Functions

def create_ner_objects(class_name, st, en):
    named_entity = TextEntity(start=st, end=en)
    named_entity_annotation = ObjectAnnotation(value=named_entity, name=class_name)
    return named_entity_annotation


def generate_predictions(datarow):
    ## external_id has the form "<split>_<row index>.txt", e.g. "train_en_12.txt"
    external_id = datarow["external_id"]
    dataset_name = external_id.split("_")[0] + "_" + external_id.split("_")[1]
    datarow_index = int(external_id.split("_")[2].split(".")[0])
    uid = datarow["id"]

    text_data_row = datasets[dataset_name][datarow_index]
    tokens = text_data_row["tokens"]
    tokenized_input = tokenizer(tokens, is_split_into_words=True)
    sentence = tokenizer.decode(tokenized_input["input_ids"], skip_special_tokens=True)

    annotations = []

    ## Generate predictions
    predictions = nlp(sentence)

    ## Process predictions and compute text entities. For each high-confidence B-* token,
    ## extend the span across the following I-* tokens of the same class, then convert the
    ## exclusive end offset returned by the pipeline to the inclusive offset TextEntity expects.
    try:
        for index, item in enumerate(predictions):
            score = item["score"]
            if score > 0.99:
                entity = item["entity"]
                start = item["start"]
                end = item["end"]
                if entity in ("B-PER", "B-ORG", "B-LOC", "B-MISC"):
                    class_name = entity[2:]
                    for next_item in predictions[index + 1:]:
                        if next_item["entity"] == "I-" + class_name:
                            end = next_item["end"]
                        else:
                            break
                    annotations.append(create_ner_objects(class_name, start, end - 1))
    except Exception as e:
        print(e)

    text_data = TextData(uid=uid)

    return text_data, annotations


def create_data_rows_payload(payload):
    data_row_content = None
    ## label_content is unused here; annotations are generated later by generate_predictions
    label_content = None
    try:
        h, text_data_row, lang = payload
        file_name = lang + "_" + str(h) + ".txt"

        tokens = text_data_row["tokens"]
        tokenized_input = tokenizer(tokens, is_split_into_words=True)
        sentence = tokenizer.decode(tokenized_input["input_ids"], skip_special_tokens=True)

        embeddings = embedding_model.encode(sentence)
        embeddings_metadata = DataRowMetadataField(
            schema_id=embedding_field.uid,
            ## Labelbox currently supports custom embeddings of at most 128 dimensions
            value=embeddings[:128].tolist(),
        )
        language_metadata = DataRowMetadataField(
            schema_id=language_field.uid,
            value=lang,
        )
        metadata_payload = [language_metadata, embeddings_metadata]

        data_row_content = {
            DataRow.row_data: "gs://labelbox-datasets/wiki_neural_text_ner/" + file_name,
            DataRow.external_id: file_name,
            DataRow.metadata_fields: metadata_payload,
        }
    except Exception as e:
        print(e)

    return data_row_content, label_content
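
Before launching the full import loop, you can exercise the prediction helper on a single hand-built record. The external_id below is hypothetical but follows the <split>_<row index>.txt naming that create_data_rows_payload produces, and the id value is a placeholder rather than a real Labelbox data row ID.

## Minimal dry run of generate_predictions on a fabricated data row record.
## "val_en_12.txt" resolves to row 12 of the WikiNEuRal "val_en" split;
## "PLACEHOLDER_DATA_ROW_UID" stands in for a real Labelbox data row ID.
sample_datarow = {"external_id": "val_en_12.txt", "id": "PLACEHOLDER_DATA_ROW_UID"}
text_data, annotations = generate_predictions(sample_datarow)
print(len(annotations), "entities predicted for", text_data.uid)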

Create a dataset

dataset = client.create_dataset(name="WikiNEuRal Text NER")

Set up ontology

This example requires a custom metadata string field named language. For help, see Custom fields.
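
If the language field does not exist in your organization yet, recent versions of the SDK can create it programmatically. A minimal sketch, assuming your installed SDK exposes DataRowMetadataOntology.create_schema and the DataRowMetadataKind enum (otherwise create the field in the Labelbox UI as described in Custom fields):

## One-time setup (only needed if the "language" custom metadata field is missing).
from labelbox.schema.data_row_metadata import DataRowMetadataKind

metadata_ontology = client.get_data_row_metadata_ontology()
if "language" not in metadata_ontology.custom_by_name:
    metadata_ontology.create_schema(name="language", kind=DataRowMetadataKind.string)
    ## Re-fetch so custom_by_name includes the newly created field
    metadata_ontology = client.get_data_row_metadata_ontology()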

metadata_ontology = client.get_data_row_metadata_ontology()

## Fetch the custom metadata schema named "language" (string kind): https://docs.labelbox.com/docs/datarow-metadata#custom-fields
language_field = metadata_ontology.custom_by_name["language"]
embedding_field = metadata_ontology.reserved_by_name["embedding"]
ontology = OntologyBuilder()

PER = Tool(tool=Tool.Type.NER, name="PER")
ontology.add_tool(PER)
ORG = Tool(tool=Tool.Type.NER, name="ORG")
ontology.add_tool(ORG)
LOC = Tool(tool=Tool.Type.NER, name="LOC")
ontology.add_tool(LOC)
MISC = Tool(tool=Tool.Type.NER, name="MISC")
ontology.add_tool(MISC)

ontology = client.create_ontology("WikiNEuRal Text NER", ontology.asdict())

Set up a labeling project

project = client.create_project(name="WikiNEuRal Text NER", media_type=MediaType.Text)
project.connect_ontology(ontology)
ontology_from_project = OntologyBuilder.from_project(project)
project.update(queue_mode=project.QueueMode.Batch)

Process and create data row payloads in batches

tuples = []
for item in datasets:
    # if item == "train_en":  ## uncomment (and indent the loop below) to restrict to a single split
    for h, text_data_row in enumerate(datasets[item]):
        tuples.append((h, text_data_row, item))

if MAX_DATA_ROW_LIMIT is not None:
    tuples = random.sample(tuples, MAX_DATA_ROW_LIMIT)
chunked_tuples = list()
for i in range(0, len(tuples), BATCH_SIZE):
    chunked_tuples.append(tuples[i:i + BATCH_SIZE])

Main loop to import data

for current_index, chunk in enumerate(chunked_tuples):
    start_time = time.time()
    print("Executing iteration {} of {}".format(current_index, len(chunked_tuples)))

    ## Generate data row payload
    data_rows = []
    for item in tqdm.tqdm(chunk):
        datarow, label = create_data_rows_payload(item)
        data_rows.append(datarow)

    ## Create data rows in Labelbox
    task = dataset.create_data_rows(data_rows)
    task.wait_till_done()
    print(task)

    ## Submit a batch of the recently created data rows
    batch_datarows = []
    for item in task.result:
        batch_datarows.append(item['id'])

    batch = project.create_batch(
        str(current_index) + "_" + binascii.b2a_hex(os.urandom(5)).decode(),  # name of the batch
        batch_datarows,  # list of data rows
        1  # priority between 1-5
    )

    ## Generate model predictions
    ground_truth_list = LabelList()
    for item in tqdm.tqdm(task.result):
        result = generate_predictions(item)
        ground_truth_list.append(Label(
            data=result[0],
            annotations=result[1]
        ))

    ## Convert model predictions to NDJSON format
    ground_truth_list.assign_feature_schema_ids(OntologyBuilder.from_project(project))
    ground_truth_ndjson = list(NDJsonConverter.serialize(ground_truth_list))

    ## Upload model predictions as ground truth
    upload_task = LabelImport.create_from_objects(client, project.uid, f"upload-job-{uuid4()}", ground_truth_ndjson)
    upload_task.wait_until_done()
    print(upload_task.errors)

    print(str(time.time() - start_time) + " seconds")