Files
HyCoRec/HyCoRec/edger/tgredial.py
Tokisakix a6349f6767 add edger
2025-08-07 09:47:36 +08:00

53 lines
1.6 KiB
Python

import json
import pickle
from tqdm import tqdm
# Directory holding the TGReDial files read by get_side_data()
# (cn-dbpedia.txt and token2id.json). The path is relative, so it is resolved
# against the current working directory at run time.
DATA_ROOT = "data/dataset/tgredial/pkuseg"
def _build_adjacency(pairs):
    """Return ``{node: [neighbours]}`` treating each ``(a, b)`` pair as an undirected edge.

    Neighbours are de-duplicated through a set, so the order of each
    neighbour list is unspecified.
    """
    neighbours = {}
    for a, b in pairs:
        neighbours.setdefault(a, set()).add(b)
        neighbours.setdefault(b, set()).add(a)
    return {node: list(adjacent) for node, adjacent in neighbours.items()}


def get_side_data():
    """Load the entity and word side-information graphs for TGReDial.

    Reads three files:

    * ``{DATA_ROOT}/cn-dbpedia.txt`` -- one triple per line, three
      tab-separated fields; the middle field is ignored and the first and
      third are linked.
    * ``{DATA_ROOT}/token2id.json`` -- its keys, lower-cased, form the set of
      known tokens.
    * ``data/conceptnet/zh_side.txt`` -- one pair per line, two
      space-separated words; a pair is kept only when both words are known
      tokens.

    Returns:
        A tuple ``(entity_side, word_side)``. Each is a dict mapping a node to
        a list of its distinct neighbours (edges are undirected; list order is
        unspecified).

    Raises:
        ValueError: If a non-empty line does not have the expected number of
            fields.
        OSError: If any of the input files cannot be opened.
    """
    entity_pairs = []
    with open(f"{DATA_ROOT}/cn-dbpedia.txt", "r", encoding="utf-8") as file:
        for line in file:
            # rstrip is safe on a final line with no trailing newline and on
            # an empty last field, both of which tripped the manual b[-1] check.
            line = line.rstrip("\n")
            if not line:
                continue
            head, _, tail = line.split("\t")
            entity_pairs.append((head, tail))
    entity_side = _build_adjacency(tqdm(entity_pairs))

    # Close the vocabulary file deterministically instead of leaking the handle.
    with open(f"{DATA_ROOT}/token2id.json", "r", encoding="utf-8") as token_file:
        token_set = {token.lower() for token in json.load(token_file)}

    # NOTE(review): tokens are lower-cased above but the ConceptNet words are
    # compared as-is; presumably zh_side.txt is already lower-case -- confirm.
    word_pairs = []
    with open("data/conceptnet/zh_side.txt", "r", encoding="utf-8") as concept_net_words:
        for words in tqdm(concept_net_words.readlines()):
            # Strip only the newline: slicing off the last character
            # unconditionally corrupted a final line that lacks one.
            words = words.rstrip("\n")
            if not words:
                continue
            a, b = words.split(" ")
            if a in token_set and b in token_set:
                word_pairs.append((a, b))
    word_side = _build_adjacency(word_pairs)

    return entity_side, word_side
def tgredial_edger():
    """Return the ``(item_edger, entity_edger, word_edger)`` maps for TGReDial.

    The item and entity maps are the very same dict object (the entity map
    produced by ``get_side_data``), so mutating one mutates the other.
    """
    side_maps = get_side_data()
    entity_map = side_maps[0]
    word_map = side_maps[1]
    # Items reuse the entity graph; no separate item graph is built.
    return entity_map, entity_map, word_map