# NER data augmentation script example
import json
import random
# Sample annotated record: each label is [start, end, entity_type] with
# character offsets into "text" (end exclusive, as used by the slicing below).
js = {"id": 10000, "text": "彭于晏,1982年3月24日出生于台湾省澎湖县,彭于晏毕业于不列颠哥伦比亚大学,加拿大籍华裔影视男演员、歌手", "labels": [[0, 3, "姓名"], [4, 14, "时间"], [24, 27, "姓名"]]}

# Candidate replacement strings for name entities.
name_list = ["DXFDXF", "DJHDJH", "DJHH"]


def find_entities(labels, entity_type="姓名"):
    """Collect the labels of one entity type that should be augmented.

    Args:
        labels: list of [start, end, entity_type] spans.
        entity_type: the tag to select (defaults to the name tag "姓名").

    Returns:
        A list of [label, span_length, index] triples, one per matching
        label, where index is the label's position in ``labels``.
    """
    return [
        [label, label[1] - label[0], index]
        for index, label in enumerate(labels)
        if label[2] == entity_type
    ]


def replace_entity(text, labels, index, replacement):
    """Replace the span of ``labels[index]`` in ``text`` with ``replacement``.

    ``labels`` must be ordered by end offset. Labels before ``index`` are
    kept unchanged and labels after it are shifted by the change in length,
    so the result is only meaningful when the spans do not overlap.

    Args:
        text: the annotated text.
        labels: list of [start, end, entity_type] spans, sorted by end.
        index: position in ``labels`` of the span to replace.
        replacement: the string substituted for the original span.

    Returns:
        A ``(new_text, new_labels)`` tuple. The inputs are not modified and
        ``new_labels`` shares no list objects with ``labels``.
    """
    start, end, entity_type = labels[index]
    # How far every later span moves once the replacement is spliced in.
    shift = len(replacement) - (end - start)

    new_text = text[:start] + replacement + text[end:]

    new_labels = [list(label) for label in labels[:index]]
    new_labels.append([start, start + len(replacement), entity_type])
    new_labels.extend(
        [label[0] + shift, label[1] + shift, label[2]]
        for label in labels[index + 1:]
    )
    return new_text, new_labels


def main(output_path="new.txt"):
    """Augment the sample record and write one JSON record per name entity.

    For every "姓名" label in ``js`` a copy of the record is produced in
    which that single name is replaced by a random entry of ``name_list``.
    Progress is printed to stdout and each augmented record is written to
    ``output_path`` as one JSON object per line.

    Only the in-memory sample record is processed; no input file is read.
    """
    print(js["id"])

    text = js["text"]
    # Sort a copy by end offset so the sample record itself is not mutated.
    labels = sorted(js["labels"], key=lambda span: span[1])
    print("text:", text)
    print("labels:", labels)

    to_augment_entity = find_entities(labels)
    print("to_augment_entity:", to_augment_entity)

    # The context manager guarantees the output file is closed on errors too.
    with open(output_path, "w", encoding="utf-8") as new_file:
        for label, _length, index in to_augment_entity:
            replace_text = random.choice(name_list)
            new_text, new_labels = replace_entity(text, labels, index, replace_text)

            print("replace:", replace_text)
            print("to_replace:", text[label[0]:label[1]])
            print("new_text:", new_text)
            print("new_label:", new_labels)

            # NOTE(review): every augmented record gets the fixed id 1000
            # (the source record's id is 10000) - confirm this is intended.
            new_dict = {"id": 1000, "text": new_text, "labels": new_labels}
            print(new_dict)
            print("===========")

            # Emit real JSON (not the Python repr) so the output can be
            # parsed back with json.loads; keep non-ASCII text readable.
            new_file.write(json.dumps(new_dict, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    main()