Source code for ehn.parse.parser

#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
Please refer to the tutorial ":ref:`tutorial-parse_parser`".
"""

# pylint: disable=invalid-name, no-self-use

__author__ = "Mu Yang <http://muyang.pro>"
__copyright__ = "2018-2021 CKIP Lab"
__license__ = "GPL-3.0"

import re
from wcwidth import wcswidth

from ply.lex import lex
from ply.yacc import yacc

from .node import (
    EhnParseAnchor,
    EhnParseAnyPlaceholder,
    EhnParseCoindexReference,
    EhnParseFunction,
    EhnParseFunctionEntity,
    EhnParseFunctionFeature,
    EhnParseNameEntity,
    EhnParseNormalEntity,
    EhnParseNormalFeature,
    EhnParseNumberEntity,
    EhnParseRestrictionPlaceholder,
    EhnParseSubject,
    EhnParseSubjectReference,
    EhnParseTildeReference,
)

################################################################################################################################
# Core
#

# Single-character tokens: token name -> the literal character it stands for.
# The parser's error reporting uses this mapping to show the character
# instead of the token name.
EHN_TOKENS_CHAR = dict(
    QUOTE='"',
    EQUAL="=",
    COLON=":",
    COMMA=",",
    SLASH="/",
    ULINE="_",
    LPAREN="(",
    RPAREN=")",
    LBRACE="{",
    RBRACE="}",
    TILDE="~",
)

# Full token list shared by the lexer and the parser: the multi-character
# token kinds first, followed by every single-character token name.
EHN_TOKENS = ["TEXT", "NUMBER", "COINDEX", "COINDEX0"] + list(EHN_TOKENS_CHAR)


class EhnSyntaxError(SyntaxError):
    """E-HowNet Syntax Error.

    Parameters
    ----------
        *args
            forwarded unchanged to :class:`SyntaxError`.
        pos
            character offset of the error in the input text, or ``None``
            when no offset is available.
    """

    def __init__(self, *args, pos=None):
        super().__init__(*args)
        self.pos = pos

    def show_pos(self, text):
        """Show error position.

        Builds a marker line that, printed under ``text``, places a ``^``
        below the offending position.

        Parameters
        ----------
            text
                original input text

        Returns
        -------
            str
                padding spaces followed by ``^``. When :attr:`pos` is
                ``None`` the slice covers the whole text, so the caret is
                placed right after its end.
        """
        prefix = text[: self.pos]
        width = wcswidth(prefix)
        if width < 0:
            # wcswidth() reports -1 when the prefix holds a non-printable
            # character; " " * -1 would silently put the caret at column 0,
            # so fall back to one column per character.
            width = len(prefix)
        return " " * width + "^"
################################################################################################################################ # Lexer # class _EhnLexer: def __init__(self, **kwargs): self._lexer = lex(module=self, **kwargs) tokens = EHN_TOKENS # Skip all spaces # t_ignore = ' \t\n\r\f\v' # Default state tokens t_QUOTE = r'"' t_EQUAL = r"=" t_COLON = r":" t_COMMA = r"," t_SLASH = r"/" t_ULINE = r"_" t_LPAREN = r"\(" t_RPAREN = r"\)" t_LBRACE = r"{" t_RBRACE = r"}" t_TILDE = r"~" def t_ANY_error(self, t): raise EhnSyntaxError(f"Illegal character ‘{t.value[0]}’ at position {t.lexpos}.", pos=t.lexpos) # t.lexer.skip(1) def t_TEXT(self, t): r"[A-Za-z0-9\x80-\U0010FFFF|#+\-.?]+" if _isnumber(t.value): t.type = "NUMBER" elif t.value == "x?": t.type = "COINDEX0" elif _is_coindex(t.value): t.type = "COINDEX" else: match = re.search(r"[+\-.?]", t.value) if match: pos = t.lexpos + match.start() raise EhnSyntaxError(f"Illegal character ‘{match.group(0)}’ at position {pos}.", pos=pos) return t # Invoke the lexer def __call__(self, data): self._lexer.input(data) return iter(self._lexer)
class EhnLexer(_EhnLexer):
    """E-HowNet Lexer.

    Public name of the lexer; it adds nothing to the implementation it
    inherits from.

    .. method:: __call__(self, data)

        Run tokenization. Feeds ``data`` to the lexer and returns an
        iterator over it; an :class:`EhnSyntaxError` is raised when an
        illegal character is met.
    """
################################################################################################################################
# Parser
#


class _EhnParser:
    """Parser implementation built on ``ply.yacc``.

    The ``tokens`` list and the ``p_*`` methods below are the grammar handed
    to ``yacc(module=self)``. The docstring of every ``p_*`` method is a
    grammar production (one alternative per line, continuation lines start
    with ``|``), so those docstrings must not be edited as documentation.
    """

    def __init__(self, lexer=None, **kwargs):
        """Store (or create) the lexer and build the ply parser.

        ``lexer`` must be an :class:`EhnLexer` when given; otherwise a fresh
        one is created. ``kwargs`` are forwarded to ``yacc``.
        """
        if lexer is not None:
            assert isinstance(lexer, EhnLexer), f"{lexer} is not an EhnLexer!"
            self.lexer = lexer
        else:
            self.lexer = EhnLexer()
        self._parser = yacc(module=self, **kwargs)

    @property
    def _lexer(self):
        """The raw ply lexer object wrapped by :attr:`lexer`."""
        return self.lexer._lexer  # pylint: disable=protected-access

    # Token names the grammar may reference (same list the lexer uses).
    tokens = EHN_TOKENS

    # Define the parser
    # Error rule: always raises EhnSyntaxError. ``t`` is None when the input
    # ended too early; otherwise it is the offending token.
    def p_error(self, t):
        if t is None:
            msg = "Unexpected ending."
            pos = None
        else:
            msg = f"Unexpected symbol ‘{t.value}’ at position {t.lexpos}."
            pos = t.lexpos

        # List the symbols that have an entry in the action table for the
        # parser's current state, showing single-character tokens as the
        # character itself and "$end" as ‘ENDING’.
        # NOTE(review): relies on the ply parser object exposing ``action``
        # and ``state`` — confirm against the ply version in use.
        syms = []
        for sym in self._parser.action[self._parser.state].keys():
            sym = EHN_TOKENS_CHAR.get(sym, sym)
            if sym == "$end":
                syms.append("‘ENDING’")
            else:
                syms.append(f"‘{sym}’")
        if len(syms) > 1:
            syms[-1] = "or " + syms[-1]

        msg += f' Expecting a {", ".join(syms)}.'

        raise EhnSyntaxError(msg, pos=pos)

    # Object
    # Top-level rule: the result is the entity or subject node itself.
    def p_expr(self, p):
        """expr : entity
        | subject"""
        p[0] = p[1]

    # Subject
    # The first feature creates the subject node; each further feature after
    # a comma is added to that same node.
    def p_subject(self, p):
        """subject : feature
        | subject COMMA feature"""
        if len(p) == 2:
            p[0] = EhnParseSubject(p[1])
        else:
            p[1].add_feature(p[3])
            p[0] = p[1]

    # Entity
    def p_entity_number(self, p):
        """entity : LBRACE NUMBER RBRACE"""
        p[0] = EhnParseNumberEntity(p[2])

    def p_entity_name(self, p):
        """entity : LBRACE QUOTE TEXT QUOTE RBRACE"""
        p[0] = EhnParseNameEntity(p[3])

    # An entity is built incrementally: "entityOpen" creates the node,
    # "entityAnchor" / "entityFeature" decorate that same node, and the
    # closing rule returns it once the right brace is seen.
    def p_entity_normal_open(self, p):
        """entityOpen : LBRACE TEXT"""
        p[0] = EhnParseNormalEntity(p[2])

    def p_entity_function_open(self, p):
        """entityOpen : LBRACE function"""
        p[0] = EhnParseFunctionEntity(p[2])

    def p_entity_anchor(self, p):
        """entityAnchor : entityOpen anchor"""
        p[1].anchor = p[2]
        p[0] = p[1]

    # First feature of an entity (introduced by a colon).
    def p_entity_feature0(self, p):
        """entityFeature : entityOpen COLON feature
        | entityAnchor COLON feature"""
        p[1].add_feature(p[3])
        p[0] = p[1]

    # Any further feature of an entity (introduced by a comma).
    def p_entity_feature(self, p):
        """entityFeature : entityFeature COMMA feature"""
        p[1].add_feature(p[3])
        p[0] = p[1]

    def p_entity_close(self, p):
        """entity : entityOpen RBRACE
        | entityAnchor RBRACE
        | entityFeature RBRACE"""
        p[0] = p[1]

    # Reference
    def p_reference_coindex(self, p):
        """reference : LBRACE COINDEX RBRACE"""
        p[0] = EhnParseCoindexReference(p[2])

    def p_reference_subject(self, p):
        """reference : LBRACE COINDEX0 RBRACE"""
        p[0] = EhnParseSubjectReference()

    def p_reference_tilde(self, p):
        """reference : LBRACE TILDE RBRACE"""
        p[0] = EhnParseTildeReference()

    # Placeholder
    def p_restriction(self, p):
        """restriction : SLASH entity
        | SLASH reference"""
        p[0] = EhnParseRestrictionPlaceholder(p[2])

    def p_restriction_anchor(self, p):
        """restriction : SLASH entity anchor
        | SLASH reference anchor"""
        p[0] = EhnParseRestrictionPlaceholder(p[2], anchor=p[3])

    def p_any(self, p):
        """any : LBRACE RBRACE"""
        p[0] = EhnParseAnyPlaceholder()

    # Feature
    # A feature is "head = value"; the head is either a plain TEXT or a
    # function, which selects the node class.
    def p_feature(self, p):
        """feature : TEXT EQUAL entity
        | TEXT EQUAL reference
        | TEXT EQUAL restriction
        | TEXT EQUAL any"""
        p[0] = EhnParseNormalFeature(p[1], p[3])

    def p_function_feature(self, p):
        """feature : function EQUAL entity
        | function EQUAL reference
        | function EQUAL restriction
        | function EQUAL any"""
        p[0] = EhnParseFunctionFeature(p[1], p[3])

    # Function
    # Empty parentheses yield a function whose argument is an "any"
    # placeholder.
    def p_function_any(self, p):
        """function : TEXT LPAREN RPAREN"""
        p[0] = EhnParseFunction(p[1], EhnParseAnyPlaceholder())

    def p_function_restriction(self, p):
        """function : TEXT LPAREN restriction RPAREN"""
        p[0] = EhnParseFunction(p[1], p[3])

    # Like entities, functions with entity/reference arguments are built
    # incrementally: open with the first argument, add further arguments,
    # then close on the right parenthesis.
    def p_function_open(self, p):
        """functionOpen : TEXT LPAREN entity
        | TEXT LPAREN reference"""
        p[0] = EhnParseFunction(p[1], p[3])

    def p_function_argument(self, p):
        """functionArgument : functionOpen COMMA entity
        | functionOpen COMMA reference
        | functionArgument COMMA entity
        | functionArgument COMMA reference"""
        p[1].add_argument(p[3])
        p[0] = p[1]

    def p_function_close(self, p):
        """function : functionOpen RPAREN
        | functionArgument RPAREN"""
        p[0] = p[1]

    # Anchor
    def p_anchor(self, p):
        """anchor : ULINE COINDEX"""
        p[0] = EhnParseAnchor(p[2])

    # Invoke the parser
    def __call__(self, data: str, *args, debug=False, **kwargs):
        """Parse ``data`` and return the result of the ply parser.

        With ``debug`` set, the input and every token produced by the lexer
        are printed before parsing; ``debug`` is also forwarded to ply.
        """
        if debug:
            print(data)
            for tok in self.lexer(data):
                print(tok)
        # NOTE(review): ``lexer`` is passed by keyword, so any extra
        # positional argument in ``args`` would land on the parameter that
        # follows ``data`` in ply's ``parse`` — presumably ``lexer`` again;
        # confirm whether ``*args`` is actually usable here.
        ret = self._parser.parse(data, lexer=self._lexer, *args, debug=debug, **kwargs)
        return ret
class EhnParser(_EhnParser):
    """E-HowNet Parser.

    Public name of the parser; it adds nothing to the implementation it
    inherits from.

    .. method:: __call__(self, data: str)

        Run parsing. Returns the node produced for the whole expression
        (an entity or a subject); an :class:`EhnSyntaxError` is raised when
        the input cannot be parsed.
    """
################################################################################################################################ # Utility # def _isnumber(name): try: float(name) return True except ValueError: return False def _is_coindex(name): return _is_coindex.pattern.match(name) _is_coindex.pattern = re.compile(r"x[0-9]*")