> ## Documentation Index
> Fetch the complete documentation index at: https://docs.labelbox.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Import a labeled dataset (text)

## Initialize Huggingface dataset, models and pipeline

<CodeGroup>
  ```python Python theme={null}
  ## Load the WikiNEuRal dataset and the pretrained NER tokenizer and model
  datasets = load_dataset("Babelscape/wikineural")
  tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
  model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

  ## Use GPU if it exists.
  ## Ask PyTorch directly rather than scanning `nvidia-smi` output for "failed":
  ## that approach needs IPython's `!` shell syntax (not valid Python) and treats
  ## a missing `nvidia-smi` binary ("command not found") as "GPU available".
  import torch

  if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=0)
    embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens', device=0)
  else:
    print('Not connected to a GPU')
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
  ```
</CodeGroup>

## Set up client and configure parameters

<CodeGroup>
  ```python Python theme={null}
  ## Batch size used for batching data rows and for annotation bulk import.
  ## 500-1000 is the recommended size.
  BATCH_SIZE = 500

  ## Maximum number of data rows to import. The WikiNeural dataset has ~1.1M data rows.
  MAX_DATA_ROW_LIMIT = 2000

  ## Enter your API key here, then create the Labelbox client with it
  LB_API_KEY = "YOUR_API_KEY"
  client = labelbox.Client(LB_API_KEY)
  ```
</CodeGroup>

## Functions

<CodeGroup>
  ```python Python theme={null}
  def create_ner_objects(class_name, st, en):
    """Build an ObjectAnnotation for one named-entity span.

    `st` and `en` are passed through as the TextEntity start and end;
    `class_name` becomes the annotation name.
    """
    return ObjectAnnotation(
      value=TextEntity(start=st, end=en),
      name=class_name,
    )

  def generate_predictions(datarow):
    """Run the NER pipeline on one data row and turn the output into annotations.

    `datarow` is a mapping with an "external_id" of the form
    "<part0>_<part1>_<row index>.<ext>" and an "id". The first two parts
    select the dataset split and the third selects the row inside it.

    Returns a `(TextData, annotations)` tuple, where `annotations` is a list
    of objects built by `create_ner_objects`. If processing the predictions
    raises, the exception is printed and the annotations collected so far are
    returned.
    """
    ## Split the external id once instead of re-splitting it for every field
    name_parts = datarow["external_id"].split("_")
    dataset_name = name_parts[0] + "_" + name_parts[1]
    datarow_index = int(name_parts[2].split(".")[0])
    uid = datarow['id']

    ## Rebuild the sentence from the dataset tokens via the tokenizer round trip
    text_data_row = datasets[dataset_name][datarow_index]
    tokenized_input = tokenizer(text_data_row["tokens"], is_split_into_words=True)
    sentence = tokenizer.decode(tokenized_input["input_ids"], skip_special_tokens=True)
    annotations = []

    ## Generate prediction
    predictions = nlp(sentence)

    ## Entity classes that are converted into annotations
    entity_classes = ("PER", "ORG", "LOC", "MISC")

    ## process predictions and compute text entities
    try:
      ## enumerate gives the position directly; list.index() would rescan the
      ## list for every token
      for index, item in enumerate(predictions):
        ## Only a confident beginning-of-entity token starts a span
        if not item['score'] > 0.99:
          continue
        entity = item['entity']
        if not entity.startswith("B-") or entity[2:] not in entity_classes:
          continue

        class_name = entity[2:]
        continuation_tag = "I-" + class_name
        start = item['start']
        end = item['end']

        ## Extend the span across the directly following I-<class> tokens
        next_index = index + 1
        while next_index < len(predictions) and predictions[next_index]['entity'] == continuation_tag:
          end = predictions[next_index]['end']
          next_index += 1

        ## NOTE(review): end-1 presumably converts the pipeline's exclusive end
        ## offset into the inclusive end that TextEntity expects - confirm
        annotations.append(create_ner_objects(class_name, start, end-1))

    except Exception as e:
      print(e)

    text_data = TextData(uid=uid)
    return text_data, annotations

  def create_data_rows_payload(payload):
    """Build the data row dictionary for one sampled dataset row.

    `payload` is a 3-tuple `(row index, dataset row, name)`. The name is used
    both as the file-name prefix and as the value of the language metadata
    field.

    Returns `(data_row_content, label_content)`. `label_content` is always
    None. `data_row_content` is None when anything in the build raises; the
    exception is printed rather than propagated.
    """
    data_row_content = None
    label_content = None

    try:
      row_index, row, name = payload
      file_name = name + "_" + str(row_index) + '.txt'

      ## Rebuild the sentence text from the row's tokens
      encoded = tokenizer(row["tokens"], is_split_into_words=True)
      sentence = tokenizer.decode(encoded["input_ids"], skip_special_tokens=True)

      vector = embedding_model.encode(sentence)
      ## Labelbox currently only supports custom embedding of 128 max length
      embedding_entry = DataRowMetadataField(
        schema_id=embedding_field.uid,
        value=vector[:128].tolist(),
      )
      language_entry = DataRowMetadataField(
        schema_id=language_field.uid,
        value=name,
      )

      data_row_content = {
        DataRow.row_data: "gs://labelbox-datasets/wiki_neural_text_ner/"+file_name,
        DataRow.external_id: file_name,
        DataRow.metadata_fields: [language_entry, embedding_entry],
      }

    except Exception as e:
      print(e)

    return data_row_content, label_content
  ```
</CodeGroup>

## Create a dataset

<CodeGroup>
  ```python Python theme={null}
  ## Create the Labelbox dataset, named "WikiNEuRal Text NER", through the client
  dataset = client.create_dataset(name="WikiNEuRal Text NER")
  ```
</CodeGroup>

## Set up ontology

This example requires a custom metadata string field named `language`. For help, see [Custom fields](/docs/datarow-metadata#custom-fields).

<CodeGroup>
  ```python Python theme={null}
  ## Fetch the workspace's data row metadata ontology
  metadata_ontology = client.get_data_row_metadata_ontology()
  ## Look up the custom `language` field by name. It must already exist as a
  ## custom metadata field of string kind (see /docs/datarow-metadata#custom-fields);
  ## this code only reads it and does not create it.
  language_field = metadata_ontology.custom_by_name["language"]
  ## Look up the reserved `embedding` field by name
  embedding_field = metadata_ontology.reserved_by_name["embedding"]
  ```
</CodeGroup>

<CodeGroup>
  ```python Python theme={null}
  ## One NER tool per entity class
  PER, ORG, LOC, MISC = (
    Tool(tool=Tool.Type.NER, name=class_name)
    for class_name in ("PER", "ORG", "LOC", "MISC")
  )

  ## Register the tools on a builder, in the same order
  ontology_builder = OntologyBuilder()
  for ner_tool in (PER, ORG, LOC, MISC):
    ontology_builder.add_tool(ner_tool)

  ## Create the ontology in Labelbox from the builder's dictionary form
  ontology = client.create_ontology("WikiNEuRal Text NER", ontology_builder.asdict())
  ```
</CodeGroup>

## Set up a labeling project

<CodeGroup>
  ```python Python theme={null}
  ## Create a text project and connect the ontology to it
  project = client.create_project(name = "WikiNEuRal Text NER", media_type=MediaType.Text)
  project.connect_ontology(ontology)
  ## Read the ontology back from the project
  ontology_from_project = OntologyBuilder.from_project(project)
  ## Set the project's queue mode to Batch
  project.update(queue_mode=project.QueueMode.Batch)
  ```
</CodeGroup>

## Process and create data row payload in batches

<CodeGroup>
  ```python Python theme={null}
  ## Flatten the dataset into (row index, row, name) tuples. Iterating
  ## `datasets` yields the names that are then used to index into it; to import
  ## a single one only, filter on `item` inside this loop.
  tuples = []
  for item in datasets:
    tuples.extend(
      (h, text_data_row, item)
      for h, text_data_row in enumerate(datasets[item])
    )

  ## Randomly keep at most MAX_DATA_ROW_LIMIT rows. The sample size is capped at
  ## the number of rows because random.sample raises ValueError when asked for
  ## more items than the population holds. None disables the limit.
  if MAX_DATA_ROW_LIMIT is not None:
    tuples = random.sample(tuples, min(MAX_DATA_ROW_LIMIT, len(tuples)))
  ```
</CodeGroup>

<CodeGroup>
  ```python Python theme={null}
  ## Cut the tuples into consecutive slices of at most BATCH_SIZE items each
  chunked_tuples = [
    tuples[offset:offset + BATCH_SIZE]
    for offset in range(0, len(tuples), BATCH_SIZE)
  ]
  ```
</CodeGroup>

## Main iterator loop to import data

<CodeGroup>
  ```python Python theme={null}
  ## For every chunk: create the data rows, submit them to the project as a
  ## batch, run the model on them and upload the predictions as labels.
  ## enumerate supplies the chunk position without searching the list.
  for current_index, chunk in enumerate(chunked_tuples):
    start_time = time.time()
    ## Report progress 1-based so the last chunk reads "N of N"
    print("Executing {} of {} iteration".format(current_index + 1, len(chunked_tuples)))

    ## Generate data row payload
    data_rows = []
    for item in tqdm.tqdm(chunk):
      datarow, _ = create_data_rows_payload(item)
      ## create_data_rows_payload returns None for the data row when it fails;
      ## those entries must not be sent to Labelbox
      if datarow is not None:
        data_rows.append(datarow)

    if not data_rows:
      print("No data rows could be built for this chunk, skipping")
      continue

    ## Create data rows in Labelbox
    task = dataset.create_data_rows(data_rows)
    task.wait_till_done()
    print(task)

    ## Submit a batch of the recently created data rows
    batch_datarows = [item['id'] for item in task.result]

    batch = project.create_batch(
      ## name of the batch: chunk index plus a random hex suffix. bytes.hex()
      ## yields plain hex text; str() on the hexlified bytes would embed b'...'
      str(current_index) + "_" + os.urandom(5).hex(),
      batch_datarows, # list of Data Rows
      1 # priority between 1-5
    )

    ## Generate model predictions
    ground_truth_list = LabelList()
    for item in tqdm.tqdm(task.result):
      text_data, annotations = generate_predictions(item)
      ground_truth_list.append(Label(data=text_data, annotations=annotations))

    ## Convert model predictions to NDJSON format
    ground_truth_list.assign_feature_schema_ids(OntologyBuilder.from_project(project))
    ground_truth_ndjson = list(NDJsonConverter.serialize(ground_truth_list))

    ## Upload model predictions as ground truth
    upload_task = LabelImport.create_from_objects(client, project.uid, f"upload-job-{uuid4()}", ground_truth_ndjson)
    upload_task.wait_until_done()
    print(upload_task.errors)

    print(str((time.time() - start_time))+" seconds")
  ```
</CodeGroup>
