Initialize the Hugging Face dataset, models, and pipeline
datasets = load_dataset("Babelscape/wikineural")
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")
## Use the GPU if one is available
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
else:
    print(gpu_info)
    nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=0)
    embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens', device=0)
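Before wiring everything into Labelbox, it can help to confirm that the dataset and the NER pipeline behave as expected. The short check below is illustrative only: the split name "train_en" and the sample sentence are assumptions, not part of the import flow.
## Optional sanity check (illustrative; assumes an English training split named "train_en").
print(list(datasets.keys()))      # per-language splits, e.g. train_en, val_en, test_en, ...
print(datasets["train_en"][0])    # one row with its pre-split tokens and NER tags
sample = "Wolfgang lives in Berlin and works for the United Nations."
for prediction in nlp(sample):
    ## Each prediction carries an entity tag (e.g. B-PER), a confidence score, and character offsets.
    print(prediction["entity"], round(float(prediction["score"]), 3),
          sample[prediction["start"]:prediction["end"]])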
Set up client and configure parameters
## Enter your API key here
LB_API_KEY = "YOUR_API_KEY"
client = labelbox.Client(LB_API_KEY)
## Batch size for creating data rows and bulk-importing annotations. A batch size of 500-1000 is recommended.
BATCH_SIZE = 500
## Set the maximum number of data rows to import. The WikiNEuRal dataset has ~1.1M data rows.
MAX_DATA_ROW_LIMIT = 2000
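Hard-coding the key works for a quick demo, but reading it from an environment variable keeps the secret out of the notebook. A minimal alternative sketch (the LB_API_KEY environment variable name is just a convention assumed here):
## Alternative: read the API key from an environment variable instead of pasting it inline.
import os
LB_API_KEY = os.environ.get("LB_API_KEY", "YOUR_API_KEY")
client = labelbox.Client(api_key=LB_API_KEY)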
Functions
def create_ner_objects(class_name, st, en):
    ## Wrap a character span as a Labelbox text entity annotation
    named_entity = TextEntity(start=st, end=en)
    named_entity_annotation = ObjectAnnotation(value=named_entity, name=class_name)
    return named_entity_annotation
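generate_predictions below passes end - 1 to this helper because the Hugging Face pipeline reports exclusive character end offsets, while the TextEntity span is apparently treated as inclusive. A tiny check of that convention (the sentence and offsets are made up for illustration):
## Illustrative only: verify the inclusive-end convention used by create_ner_objects.
example_sentence = "Barack Obama was born in Hawaii."
hf_start, hf_end = 0, 12                            # pipeline-style offsets for "Barack Obama" (end exclusive)
print(example_sentence[hf_start:hf_end])            # "Barack Obama"
annotation = create_ner_objects("PER", hf_start, hf_end - 1)
print(annotation.value.start, annotation.value.end) # 0 11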
def generate_predictions(datarow):
    ## Recover the source split and row index from the external ID (e.g. "train_en_42.txt")
    external_id = datarow["external_id"]
    dataset_name = external_id.split("_")[0] + "_" + external_id.split("_")[1]
    datarow_index = int(external_id.split("_")[2].split(".")[0])
    uid = datarow['id']
    text_data_row = datasets[dataset_name][datarow_index]
    ## Rebuild the sentence from the pre-split tokens
    tokens = text_data_row["tokens"]
    tokenized_input = tokenizer(tokens, is_split_into_words=True)
    sentence = tokenizer.decode(tokenized_input["input_ids"], skip_special_tokens=True)
    annotations = []
    ## Generate predictions
    predictions = nlp(sentence)
    ## Process predictions and compute text entities
    try:
        for index, item in enumerate(predictions):
            score = item['score']
            if score > 0.99:
                entity = item['entity']
                start = item['start']
                end = item['end']
                ## Merge a B- token with any following I- tokens of the same class into one span
                for class_name in ("PER", "ORG", "LOC", "MISC"):
                    if entity == "B-" + class_name:
                        for next_item in predictions[index+1:]:
                            if next_item['entity'] == "I-" + class_name:
                                end = next_item['end']
                            else:
                                break
                        annotations.append(create_ner_objects(class_name, start, end-1))
    except Exception as e:
        print(e)
    text_data = TextData(uid=uid)
    return text_data, annotations
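A quick way to exercise generate_predictions on its own is to hand it a dictionary shaped like the data rows returned later by task.result. The external ID must follow the "train_en_<index>.txt" naming that create_data_rows_payload produces; the placeholder values below are hypothetical.
## Hypothetical smoke test with a fake data row payload.
fake_datarow = {"external_id": "train_en_0.txt", "id": "placeholder-data-row-id"}
text_data, annotations = generate_predictions(fake_datarow)
print(len(annotations), "high-confidence entities found")
for ann in annotations:
    print(ann.name, ann.value.start, ann.value.end)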
def create_data_rows_payload(payload):
    data_row_content = None
    label_content = None
    try:
        h, text_data_row, lang = payload
        file_name = lang + "_" + str(h) + '.txt'
        ## Rebuild the sentence from the pre-split tokens
        tokens = text_data_row["tokens"]
        tokenized_input = tokenizer(tokens, is_split_into_words=True)
        sentence = tokenizer.decode(tokenized_input["input_ids"], skip_special_tokens=True)
        ## Compute a sentence embedding to attach as custom embedding metadata
        embeddings = embedding_model.encode(sentence)
        embeddings_metadata = DataRowMetadataField(
            schema_id=embedding_field.uid,
            ## Labelbox currently supports custom embeddings of at most 128 dimensions
            value=embeddings[:128].tolist(),
        )
        language_metadata = DataRowMetadataField(
            schema_id=language_field.uid,
            value=lang,
        )
        metadata_payload = [language_metadata, embeddings_metadata]
        data_row_content = {
            DataRow.row_data: "gs://labelbox-datasets/wiki_neural_text_ner/" + file_name,
            DataRow.external_id: file_name,
            DataRow.metadata_fields: metadata_payload,
        }
    except Exception as e:
        print(e)
    return data_row_content, label_content
Create a dataset
dataset = client.create_dataset(name="WikiNEuRal Text NER")
Set up ontology
This example requires a custom metadata string field named language. For help, see Custom fields.
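If that field has not been created yet, the custom_by_name lookup below will fail. A small optional guard, using only the same lookup the notebook already relies on, makes the failure message clearer:
## Fail early with a readable message if the custom "language" metadata field is missing.
metadata_ontology = client.get_data_row_metadata_ontology()
if "language" not in metadata_ontology.custom_by_name:
    raise RuntimeError(
        "Create a custom metadata string field named 'language' first "
        "(see https://docs.labelbox.com/docs/datarow-metadata#custom-fields)."
    )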
metadata_ontology = client.get_data_row_metadata_ontology()
## Create a custom metadata schema called language of string kind: https://docs.labelbox.com/docs/datarow-metadata#custom-fields
language_field = metadata_ontology.custom_by_name["language"]
embedding_field = metadata_ontology.reserved_by_name["embedding"]
ontology = OntologyBuilder()
PER = Tool(tool=Tool.Type.NER, name="PER")
ontology.add_tool(PER)
ORG = Tool(tool=Tool.Type.NER, name="ORG")
ontology.add_tool(ORG)
LOC = Tool(tool=Tool.Type.NER, name="LOC")
ontology.add_tool(LOC)
MISC = Tool(tool=Tool.Type.NER, name="MISC")
ontology.add_tool(MISC)
ontology = client.create_ontology("WikiNEuRal Text NER", ontology.asdict())
Set up a labeling project
project = client.create_project(name = "WikiNEuRal Text NER", media_type=MediaType.Text)
project.connect_ontology(ontology)
ontology_from_project = OntologyBuilder.from_project(project)
project.update(queue_mode=project.QueueMode.Batch)
Process and create data row payload in batches
tuples = []
for item in datasets:
    ## Optionally restrict to a single split here, e.g. if item == "train_en":
    for h, text_data_row in enumerate(datasets[item]):
        tuples.append((h, text_data_row, item))
if MAX_DATA_ROW_LIMIT is not None:
    tuples = random.sample(tuples, MAX_DATA_ROW_LIMIT)
chunked_tuples = []
for i in range(0, len(tuples), BATCH_SIZE):
    chunked_tuples.append(tuples[i:i+BATCH_SIZE])
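A quick check before kicking off the import loop confirms how many batches will run:
## Sanity check on the batching above.
print(len(tuples), "data rows queued in", len(chunked_tuples), "chunks of up to", BATCH_SIZE)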
Main iterator loop to import data
for current_index, chunk in enumerate(chunked_tuples):
    start_time = time.time()
    print("Executing iteration {} of {}".format(current_index + 1, len(chunked_tuples)))
    ## Generate the data row payload
    data_rows = []
    for item in tqdm.tqdm(chunk):
        datarow, label = create_data_rows_payload(item)
        data_rows.append(datarow)
    ## Create data rows in Labelbox
    task = dataset.create_data_rows(data_rows)
    task.wait_till_done()
    print(task)
    ## Submit a batch of the recently created data rows
    batch_datarows = []
    for item in task.result:
        batch_datarows.append(item['id'])
    batch = project.create_batch(
        str(current_index) + "_" + binascii.b2a_hex(os.urandom(5)).decode(),  # name of the batch
        batch_datarows,  # list of data rows
        1  # priority between 1-5
    )
    ## Generate model predictions
    ground_truth_list = LabelList()
    for item in tqdm.tqdm(task.result):
        text_data, annotations = generate_predictions(item)
        ground_truth_list.append(Label(
            data=text_data,
            annotations=annotations
        ))
    ## Convert model predictions to NDJSON format
    ground_truth_list.assign_feature_schema_ids(OntologyBuilder.from_project(project))
    ground_truth_ndjson = list(NDJsonConverter.serialize(ground_truth_list))
    ## Upload model predictions as ground truth
    upload_task = LabelImport.create_from_objects(client, project.uid, f"upload-job-{uuid4()}", ground_truth_ndjson)
    upload_task.wait_until_done()
    print(upload_task.errors)
    print(str(time.time() - start_time) + " seconds")
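After the loop finishes, it can be worth spot-checking the last import job. The sketch below assumes the import job also exposes a statuses list alongside the errors attribute used above; treat it as optional verification rather than part of the import itself.
## Optional post-run spot check (assumes upload_task exposes `statuses` in addition to `errors`).
print("Last import state:", upload_task.state)
for status in upload_task.statuses[:3]:
    print(status)
print("Labels in final batch:", len(ground_truth_ndjson),
      "| errors reported:", len(upload_task.errors or []))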