Tahun baru, skill baru! 🚀. Masukkan kupon "skill2025" untuk diskon 30% di kelas apa saja

'NoneType' object is not iterable

import csv
import json
import codecs
import pandas as pd

file_path = "SINGGALANG.tsv"
df_raw = pd.read_csv(file_path,
                     sep="\t",
                     names=["token", "ne"],
                     skip_blank_lines=False,
                     quoting=csv.QUOTE_NONE,
                     encoding="utf-8")
# df_raw = df_raw.loc[:40]
# df_raw = df_raw.loc[:101]
# df_raw = df_raw.loc[83:101]

def token2sent(df_input):
    list_sents = []
    list_tokens = []

    for i in df_input.itertuples():
        if pd.isna(i.token) == True:
            sent = " ".join(list_tokens)

            list_sents.append(sent)
            list_tokens = []
        else:
            list_tokens.append(i.token)
    return list_sents

def generate_offset(str_sent):
    dict_token = {}
    list_char = []
    for id, char in enumerate(str_sent):
        if (char) == " ":
            token = "".join(list_char)
            dict_token[token] = {
                "start_offset": id-len(token),
                "end_offset": id
            }
            list_char = []
        else:
            list_char.append(char)

    return dict_token

def joinlistdict(list_input):
    return {'entity_id': "",
            'text': " ".join([i.get('text') for i in list_input]),
            'label': list_input[0].get('label'),
            'start_offset': list_input[0].get('start_offset'),
            'end_offset': list_input[len(list_input)-1].get('end_offset')}

def form_entities(df_input):
    list_nes_sents = []
    list_nes = []
    list_data = []

    for i in df_input.itertuples():
        if pd.isna(i.ne) == True:
            list_nes_sents.append(list_nes)
            list_nes = []
        else:
            if i.ne != "O":
                list_nes.append((i.token, i.ne))

    list_sents = token2sent(df_input)

    for id, sent in enumerate(list_sents):

        list_tmp_ents = []
        for ide, ele in enumerate(list_nes_sents[id]):
            dict_tmp = {"entity_id": ide,
                        "text": ele[0],
                        "label": ele[1]
                        }
            dict_tmp.update(generate_offset(sent).get(ele[0]))
            list_tmp_ents.append(dict_tmp)
            dict_tmp = {}

        list_data.append({
            "doc_id": "Singgalang-"+str(id),
            "doc_text": sent,
            "entities": list_tmp_ents
        })

    for idi, i in enumerate(list_data):
        tmp_list_1 = []
        tmp_list_2 = []
        if len(i.get("entities")) > 1:
            for idj, j in enumerate(i.get("entities")):
                if idj == 0:
                    tmp_list_1.append(j)
                elif idj == len(i.get("entities"))-1:
                    a = i.get("entities")[idj-1].get("end_offset")
                    b = j.get("start_offset")

                    c = i.get("entities")[idj-1].get("label")
                    d = j.get("label")

                    if b-a == 1 and c == d:
                        tmp_list_1.append(j)
                        tmp_list_2.append(tmp_list_1)
                        tmp_list_1 = []
                    else:
                        tmp_list_2.append(tmp_list_1)
                else:
                    a = i.get("entities")[idj-1].get("end_offset")
                    b = j.get("start_offset")

                    c = i.get("entities")[idj-1].get("label")
                    d = j.get("label")

                    if b-a == 1 and c == d:
                        tmp_list_1.append(j)
                    else:
                        tmp_list_2.append(tmp_list_1)
                        tmp_list_1 = []
                        tmp_list_1.append(j)
        else:
            tmp_list_2.append(i.get("entities"))

        list_data[idi]["entities"] = [joinlistdict(k) for k in tmp_list_2]

        for idl, l in enumerate(list_data[idi]["entities"]):
            list_data[idi]["entities"][idl]["entity_id"] = idl

    return list_data

print("Memulai proses konversi format set data Singgalang...")
with open("singgalang.json", "wb") as f:
    json.dump(form_entities(df_raw),
              codecs.getwriter("utf-8")(f),
              ensure_ascii=False)
print("Proses konversi format set data Singgalang selesai!")

# print(df_raw)

# data = {form_entities(df_raw)}
# df_raw = DataFrame(data)
# df_raw.to_json(
#     r'C:\Users\User\Documents\GitHub\portuguese-bert\ner_evaluation\data\singgalang.json')

# app_json = json.dump(form_entities(df_raw), sort_keys=True)
# print(app_json)

avatar alexmarpaung
@alexmarpaung

1 Kontribusi 0 Poin

Dipost 2 tahun yang lalu

Tanggapan

Baca aturan main di forum ini ya, sertakan kode atau gambar jika perlu

Belum ada Jawaban. Jadi yang pertama Jawaban

Login untuk ikut Jawaban