Postingan lainnya
Buku Ini Koding!
Baru!
Buku ini akan jadi teman perjalanan kamu belajar sampai dapat kerjaan di dunia programming!
'NoneType' object is not iterable
import csv
import json
import codecs
import pandas as pd
# Input corpus: one "token<TAB>label" pair per line.
file_path = "SINGGALANG.tsv"

# Load the corpus as a two-column frame.
# - skip_blank_lines=False keeps blank lines as all-NaN rows; the functions
#   below rely on those rows to detect sentence boundaries.
# - quoting=csv.QUOTE_NONE makes quote characters ordinary token text instead
#   of field delimiters.
df_raw = pd.read_csv(
    file_path,
    sep="\t",
    encoding="utf-8",
    names=["token", "ne"],
    quoting=csv.QUOTE_NONE,
    skip_blank_lines=False,
)
# df_raw = df_raw.loc[:40]
# df_raw = df_raw.loc[:101]
# df_raw = df_raw.loc[83:101]
def token2sent(df_input):
    """Rebuild space-joined sentences from a token-per-row frame.

    Rows are read through ``df_input.itertuples()`` and must expose a
    ``token`` attribute. A row whose token is missing (``pd.isna``) closes
    the sentence in progress; any other row contributes its token.

    Returns:
        A list with one string per closing row. Consecutive closing rows
        yield empty strings, and tokens that follow the final closing row
        are not returned.
    """
    sentences = []
    pending = []
    for row in df_input.itertuples():
        if not pd.isna(row.token):
            pending.append(row.token)
            continue
        # Missing token marks a sentence boundary: emit and start afresh.
        sentences.append(" ".join(pending))
        pending = []
    return sentences
def generate_offset(str_sent):
    """Map each space-separated token of ``str_sent`` to its character span.

    Args:
        str_sent: A sentence whose tokens are separated by single spaces.

    Returns:
        A dict ``{token: {"start_offset": int, "end_offset": int}}`` where
        ``start_offset`` is the index of the token's first character and
        ``end_offset`` is the index just past its last character.

    The final token is included even though no space follows it. Previously
    it was omitted, so looking it up returned ``None`` and callers failed
    with "'NoneType' object is not iterable".

    Note: the result is keyed by token text, so when a token occurs more
    than once only the span of its last occurrence is kept.
    """
    dict_token = {}
    token_start = 0
    for position, char in enumerate(str_sent):
        if char == " ":
            dict_token[str_sent[token_start:position]] = {
                "start_offset": token_start,
                "end_offset": position,
            }
            token_start = position + 1
    # Flush the trailing token, which has no terminating space.
    if token_start < len(str_sent):
        dict_token[str_sent[token_start:]] = {
            "start_offset": token_start,
            "end_offset": len(str_sent),
        }
    return dict_token
def joinlistdict(list_input):
    """Collapse a run of single-token entity dicts into one entity dict.

    Args:
        list_input: Non-empty list of dicts with ``text``, ``label``,
            ``start_offset`` and ``end_offset`` keys.

    Returns:
        A dict whose ``text`` is the space-joined texts, whose ``label`` and
        ``start_offset`` come from the first element and whose ``end_offset``
        comes from the last element. ``entity_id`` is left as an empty string
        for the caller to fill in.

    Raises:
        IndexError: If ``list_input`` is empty.
    """
    head = list_input[0]
    tail = list_input[-1]
    joined_text = " ".join(item.get('text') for item in list_input)
    return {
        'entity_id': "",
        'text': joined_text,
        'label': head.get('label'),
        'start_offset': head.get('start_offset'),
        'end_offset': tail.get('end_offset'),
    }
def _sentence_entities(tokens, labels):
    """Return the merged entity dicts for one sentence.

    ``tokens`` and ``labels`` are parallel lists. Offsets are character
    positions inside ``" ".join(tokens)``; ``end_offset`` is exclusive.
    Consecutive tokens that carry the same non-"O" label are merged into a
    single entity.
    """
    entities = []
    cursor = 0
    for text, label in zip(tokens, labels):
        start = cursor
        end = start + len(text)
        cursor = end + 1  # step over the single space that joins tokens
        if label == "O":
            continue
        previous = entities[-1] if entities else None
        if (previous is not None
                and previous["label"] == label
                and start - previous["end_offset"] == 1):
            # Directly follows an entity of the same label: extend it.
            previous["text"] += " " + text
            previous["end_offset"] = end
        else:
            entities.append({
                "entity_id": len(entities),
                "text": text,
                "label": label,
                "start_offset": start,
                "end_offset": end,
            })
    return entities


def form_entities(df_input):
    """Convert a token-per-row frame into a list of annotated documents.

    Args:
        df_input: Frame read through ``itertuples()`` whose rows expose
            ``token`` and ``ne`` attributes. A row with a missing token
            (``pd.isna``) ends the current sentence. The label "O" marks a
            token that is not part of an entity; a missing label on a real
            token is treated the same way.

    Returns:
        A list with one dict per sentence::

            {"doc_id": "Singgalang-<n>",
             "doc_text": "<tokens joined by single spaces>",
             "entities": [{"entity_id", "text", "label",
                           "start_offset", "end_offset"}, ...]}

        ``entity_id`` numbers the entities of a sentence from 0.

    Offsets are derived from each token's position while the sentence is
    walked, instead of being looked up by token text. This removes several
    failures of the lookup-based approach: an entity on the last token of a
    sentence no longer raises "'NoneType' object is not iterable", a token
    that occurs twice gets its own offsets, a final entity that is not
    adjacent to the previous one is no longer dropped, and a sentence with
    no entities yields an empty list rather than an IndexError. Tokens after
    the last boundary row are emitted as a final sentence.
    """
    sentences = []
    tokens = []
    labels = []
    for row in df_input.itertuples():
        if pd.isna(row.token):
            # Boundary row: close the sentence (possibly empty) in progress.
            sentences.append((tokens, labels))
            tokens, labels = [], []
            continue
        # str() guards against pandas having parsed a numeric-looking token.
        tokens.append(str(row.token))
        labels.append("O" if pd.isna(row.ne) else row.ne)
    if tokens:
        # Input did not end with a boundary row; keep the trailing sentence.
        sentences.append((tokens, labels))

    return [
        {
            "doc_id": "Singgalang-" + str(index),
            "doc_text": " ".join(sent_tokens),
            "entities": _sentence_entities(sent_tokens, sent_labels),
        }
        for index, (sent_tokens, sent_labels) in enumerate(sentences)
    ]
# Run the conversion and store the result as UTF-8 JSON. The output file is
# opened before the conversion starts; ensure_ascii=False keeps non-ASCII
# characters as-is instead of \uXXXX escapes.
print("Memulai proses konversi format set data Singgalang...")
with open("singgalang.json", mode="w", encoding="utf-8") as f:
    json.dump(form_entities(df_raw), f, ensure_ascii=False)
print("Proses konversi format set data Singgalang selesai!")
# print(df_raw)
# data = {form_entities(df_raw)}
# df_raw = DataFrame(data)
# df_raw.to_json(
# r'C:\Users\User\Documents\GitHub\portuguese-bert\ner_evaluation\data\singgalang.json')
# app_json = json.dump(form_entities(df_raw), sort_keys=True)
# print(app_json)
0
Tanggapan
Baca aturan main di forum ini ya, sertakan kode atau gambar jika perlu
Belum ada jawaban. Jadilah yang pertama menjawab!
Login untuk ikut menjawab