Source code for ehn.db.core

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
Please refer to the tutorial ":ref:`tutorial-db`".
"""

__author__ = "Mu Yang <http://muyang.pro>"
__copyright__ = "2018-2021 CKIP Lab"
__license__ = "GPL-3.0"

import os
import sqlite3
import warnings

from collections import (
    defaultdict,
)

from treelib import (
    Tree,
)

from .data import (
    EhnDbNode,
    EhnDbNodeData,
    EhnDbNodeType,
    EhnDbWordData,
)

################################################################################################################################


class EhnDb:
    """E-HowNet Database.

    Loads the ``concept`` and ``word`` tables of an E-HowNet SQLite file into
    a ``treelib`` tree (``self.tree``) and builds three text-to-node-ID lookup
    tables:

    * ``text2nid_concept`` -- full concept label -> concept node IDs.
    * ``text2nid_partial`` -- each ``"|"``-separated part of a concept label
      (when it differs from the full label) -> concept node IDs.
    * ``text2nid_word`` -- word label -> IDs of the nodes holding that word.

    Parameters
    ----------
        db_file : str
            Path to the E-HowNet SQLite database file (keyword-only).
    """

    def __init__(self, *, db_file=None):
        # NOTE: argument checks are kept as assertions so that callers still
        # see AssertionError; be aware they are skipped under ``python -O``.
        assert (
            db_file is not None
        ), "Please download the database file manually from https://ckip.iis.sinica.edu.tw/CKIP/ehownet_reg/"
        assert os.path.isfile(db_file), f"{db_file} is not a file!"

        self.tree = Tree(node_class=EhnDbNode)

        self.text2nid_concept = defaultdict(list)
        self.text2nid_word = defaultdict(list)
        self.text2nid_partial = defaultdict(list)

        # Load Database
        lite_db = sqlite3.connect(db_file)  # pylint: disable=no-member
        try:
            self._load_db(lite_db.cursor())
        finally:
            # Close the connection even when loading fails, so the database
            # handle is not leaked on a malformed/incomplete file.
            lite_db.close()

        # Normalize key-mappings: plain dicts with de-duplicated ID lists.
        self.text2nid_concept = {key: list(set(value)) for key, value in self.text2nid_concept.items()}
        self.text2nid_word = {key: list(set(value)) for key, value in self.text2nid_word.items()}
        self.text2nid_partial = {key: list(set(value)) for key, value in self.text2nid_partial.items()}

    def get_nids(self, text, *, concept=True, word=True, full_match=False):
        """Get the IDs of the nodes matching a text.

        Parameters
        ----------
            text
                The text to look up.
            concept : bool
                Search the concept labels.
            word : bool
                Search the word labels.
            full_match : bool
                When searching concepts, match the full label only; otherwise
                the ``"|"``-separated parts of a label are matched as well.

        Returns
        -------
            list
                The matching node IDs, de-duplicated and sorted ascending.
        """
        res = []
        if concept:
            res += self.text2nid_concept.get(text, [])
            if not full_match:
                res += self.text2nid_partial.get(text, [])
        if word:
            res += self.text2nid_word.get(text, [])
        return sorted(set(res))

    def get_nodes(self, text, **args):
        """Get the tree nodes matching a text.

        Accepts the same keyword arguments as :meth:`get_nids` and returns the
        corresponding nodes of ``self.tree`` in the same order.
        """
        nids = self.get_nids(text, **args)
        return [self.tree[nid] for nid in nids]

    def _load_db(self, cursor):
        """Populate the tree and the lookup tables from a database cursor.

        Reads the ``concept`` table first and builds the concept hierarchy
        under a synthetic ``ROOT`` node (ID ``0``), then reads the ``word``
        table and attaches the words to it. Rows whose parent cannot be
        resolved are reported through :func:`warnings.warn` and skipped.
        """
        cid2child = defaultdict(list)
        cid2data = {}

        # Load Concept
        cursor.execute("SELECT `id`, `parent_id`, `label`, `defn`, `is_definite` FROM concept")
        for cid, pid, label, defn, definite in cursor.fetchall():
            if pid is None:
                # Concepts without a parent hang directly below the root.
                pid = 0
            cid2child[pid].append(cid)
            cid2data[cid] = (label, defn, definite, pid)

        # Build Concept
        self.tree.create_node(tag="ROOT", identifier=0)

        def _build_concept(pid):
            for cid in cid2child[pid]:
                # Popping marks the concept as placed; whatever remains in
                # ``cid2data`` afterwards was unreachable from the root.
                label, defn, definite, _ = cid2data.pop(cid)

                # Create node
                self.tree.create_node(
                    tag=label,
                    identifier=cid,
                    parent=pid,
                    data=EhnDbNodeData(
                        type=EhnDbNodeType.C,
                        defn=defn,
                        definite=bool(definite),
                    ),
                )

                # Register key mapping
                self.text2nid_concept[label].append(cid)
                for sublabel in label.split("|"):
                    if sublabel != label:
                        self.text2nid_partial[sublabel].append(cid)

                # Recursion
                _build_concept(cid)

        _build_concept(0)

        # Check tree: report the concepts that were never reached from the root.
        for cid, (
            label,
            _,
            _,
            pid,
        ) in cid2data.items():
            warnings.warn(f"Invalid parent ID #{pid}! of Concept#{cid} {label}!")

        # Load Word
        defn2words = defaultdict(list)
        cursor.execute("SELECT `id`, `parent_id`, `label`, `sense_no`, `defn`, `is_definite`, `is_attached` FROM word")
        for wid, pid, label, sense_no, defn, definite, is_attached in cursor.fetchall():
            if pid not in self.tree:
                warnings.warn(f"Invalid parent ID #{pid}! of Word#{wid} {label}#{sense_no}!")
                continue

            if is_attached:
                # Attached words are stored on the parent node itself.
                self.tree[pid].words.append(
                    EhnDbWordData(
                        word=label,
                        sense_no=sense_no,
                    )
                )
                self.text2nid_word[label].append(pid)
            else:
                # Other words are grouped by (parent, definition); each group
                # becomes one word node below.
                defn2words[pid, defn].append(
                    (
                        wid,
                        label,
                        sense_no,
                        definite,
                    )
                )

        # Build Words
        for (pid, defn), words in defn2words.items():
            words.sort()
            # The first entry after sorting names the node; its ID is the
            # negated word ID (presumably to stay clear of concept IDs --
            # TODO confirm).
            wid, label, _, definite0 = words[0]
            nid = -wid
            node = self.tree.create_node(
                tag=label,
                identifier=nid,
                parent=pid,
                data=EhnDbNodeData(
                    type=EhnDbNodeType.W,
                    defn=defn,
                    definite=bool(definite0),
                ),
            )
            for _, word_label, sense_no, _ in words:
                node.words.append(
                    EhnDbWordData(
                        word=word_label,
                        sense_no=sense_no,
                    )
                )
                self.text2nid_word[word_label].append(nid)