Postingan lainnya
'NoneType' object is not iterable
import csv
import json
import codecs
import pandas as pd
file_path = "SINGGALANG.tsv"
df_raw = pd.read_csv(file_path,
sep="\t",
names=["token", "ne"],
skip_blank_lines=False,
quoting=csv.QUOTE_NONE,
encoding="utf-8")
# df_raw = df_raw.loc[:40]
# df_raw = df_raw.loc[:101]
# df_raw = df_raw.loc[83:101]
def token2sent(df_input):
list_sents = []
list_tokens = []
for i in df_input.itertuples():
if pd.isna(i.token) == True:
sent = " ".join(list_tokens)
list_sents.append(sent)
list_tokens = []
else:
list_tokens.append(i.token)
return list_sents
def generate_offset(str_sent):
dict_token = {}
list_char = []
for id, char in enumerate(str_sent):
if (char) == " ":
token = "".join(list_char)
dict_token[token] = {
"start_offset": id-len(token),
"end_offset": id
}
list_char = []
else:
list_char.append(char)
return dict_token
def joinlistdict(list_input):
return {'entity_id': "",
'text': " ".join([i.get('text') for i in list_input]),
'label': list_input[0].get('label'),
'start_offset': list_input[0].get('start_offset'),
'end_offset': list_input[len(list_input)-1].get('end_offset')}
def form_entities(df_input):
list_nes_sents = []
list_nes = []
list_data = []
for i in df_input.itertuples():
if pd.isna(i.ne) == True:
list_nes_sents.append(list_nes)
list_nes = []
else:
if i.ne != "O":
list_nes.append((i.token, i.ne))
list_sents = token2sent(df_input)
for id, sent in enumerate(list_sents):
list_tmp_ents = []
for ide, ele in enumerate(list_nes_sents[id]):
dict_tmp = {"entity_id": ide,
"text": ele[0],
"label": ele[1]
}
dict_tmp.update(generate_offset(sent).get(ele[0]))
list_tmp_ents.append(dict_tmp)
dict_tmp = {}
list_data.append({
"doc_id": "Singgalang-"+str(id),
"doc_text": sent,
"entities": list_tmp_ents
})
for idi, i in enumerate(list_data):
tmp_list_1 = []
tmp_list_2 = []
if len(i.get("entities")) > 1:
for idj, j in enumerate(i.get("entities")):
if idj == 0:
tmp_list_1.append(j)
elif idj == len(i.get("entities"))-1:
a = i.get("entities")[idj-1].get("end_offset")
b = j.get("start_offset")
c = i.get("entities")[idj-1].get("label")
d = j.get("label")
if b-a == 1 and c == d:
tmp_list_1.append(j)
tmp_list_2.append(tmp_list_1)
tmp_list_1 = []
else:
tmp_list_2.append(tmp_list_1)
else:
a = i.get("entities")[idj-1].get("end_offset")
b = j.get("start_offset")
c = i.get("entities")[idj-1].get("label")
d = j.get("label")
if b-a == 1 and c == d:
tmp_list_1.append(j)
else:
tmp_list_2.append(tmp_list_1)
tmp_list_1 = []
tmp_list_1.append(j)
else:
tmp_list_2.append(i.get("entities"))
list_data[idi]["entities"] = [joinlistdict(k) for k in tmp_list_2]
for idl, l in enumerate(list_data[idi]["entities"]):
list_data[idi]["entities"][idl]["entity_id"] = idl
return list_data
print("Memulai proses konversi format set data Singgalang...")
with open("singgalang.json", "wb") as f:
json.dump(form_entities(df_raw),
codecs.getwriter("utf-8")(f),
ensure_ascii=False)
print("Proses konversi format set data Singgalang selesai!")
# print(df_raw)
# data = {form_entities(df_raw)}
# df_raw = DataFrame(data)
# df_raw.to_json(
# r'C:\Users\User\Documents\GitHub\portuguese-bert\ner_evaluation\data\singgalang.json')
# app_json = json.dump(form_entities(df_raw), sort_keys=True)
# print(app_json)
0
Tanggapan
Baca aturan main di forum ini ya, sertakan kode atau gambar jika perlu
Belum ada Jawaban. Jadi yang pertama Jawaban
Login untuk ikut Jawaban