Developing a parser

Development setup
Install the package in editable mode, including the development dependencies:

```bash
pip install -e ".[dev]"
```
Skeleton
A code skeleton is available for the parser and tests:
```python
#!/usr/bin/env python3
"""XY query parser."""
from __future__ import annotations

import re
import typing

from search_query.constants import PLATFORM
from search_query.constants import PLATFORM_FIELD_TRANSLATION_MAP
from search_query.constants import QueryErrorCode
from search_query.constants import TokenTypes
from search_query.parser_base import QueryListParser
from search_query.parser_base import QueryStringParser
from search_query.query import Query
from search_query.query import SearchField


class XYParser(QueryStringParser):
    """Parser for XY queries."""

    FIELD_TRANSLATION_MAP = PLATFORM_FIELD_TRANSLATION_MAP[PLATFORM.XY]

    PARENTHESIS_REGEX = r"[\(\)]"
    LOGIC_OPERATOR_REGEX = r"\b(AND|OR|NOT)\b"
    SEARCH_FIELD_REGEX = r"..."
    OPERATOR_REGEX = r"..."
    # ...

    pattern = "|".join(
        [
            SEARCH_FIELD_REGEX,
            OPERATOR_REGEX,
            # ...
        ]
    )

    OPERATOR_PRECEDENCE = {
        # ...
    }

    def get_precedence(self, token: str) -> int:
        """Returns operator precedence for logical and proximity operators."""
        # Change precedence or add operators
        # Implement and override methods of parent class (as needed)

    def tokenize(self) -> None:
        """Tokenize the query_str."""
        # Parse tokens and positions based on the regex pattern
        for match in re.finditer(self.pattern, self.query_str):
            token = match.group().strip()
            start, end = match.span()

            # Determine token type
            if re.fullmatch(self.PARENTHESIS_REGEX, token):
                if token == "(":
                    token_type = TokenTypes.PARENTHESIS_OPEN
                else:
                    token_type = TokenTypes.PARENTHESIS_CLOSED
            elif re.fullmatch(self.LOGIC_OPERATOR_REGEX, token):
                token_type = TokenTypes.LOGIC_OPERATOR
            # ...
            else:
                self.add_linter_message(QueryErrorCode.TOKENIZING_FAILED, (start, end))
                continue

            self.tokens.append((token, token_type, (start, end)))

    # Override methods of parent class (as needed)

    def parse_query_tree(
        self,
        tokens: list,
        search_field: typing.Optional[SearchField] = None,
    ) -> Query:
        """Parse a query from a list of tokens."""
        # Parse a query tree from tokens (bottom-up or top-down)
        # Add messages to self.linter_messages:
        # self.add_linter_message(QueryErrorCode.ADD_CODE, (start, end))

    def translate_search_fields(self, query: Query) -> None:
        """Translate search fields."""
        # Translate search fields to standard names using self.FIELD_TRANSLATION_MAP
        # Add messages to self.linter_messages if needed:
        # self.add_linter_message(QueryErrorCode.ADD_CODE, (start, end))

    def parse(self) -> Query:
        """Parse a query string."""
        self.tokenize()
        self.add_artificial_parentheses_for_operator_precedence()
        query = self.parse_query_tree(self.tokens)
        self.translate_search_fields(query)
        # If self.mode == "strict", raise an exception if self.linter_messages is not empty
        return query


class XYListParser(QueryListParser):
    """Parser for XY (list format) queries."""

    LIST_ITEM_REGEX = r"..."

    def __init__(self, query_list: str) -> None:
        super().__init__(query_list, XYParser)

    def get_token_str(self, token_nr: str) -> str:
        return f"#{token_nr}"

    # Override methods of parent class (as needed)
    # The parse() method of QueryListParser is called to parse the list of queries
```
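Once the methods are implemented, the parser can be used directly (a minimal sketch; the query string is illustrative):

```python
from search_query.parser_xy import XYParser

parser = XYParser("AB=(Health)")
query = parser.parse()  # returns a Query object
print(query.to_string("structured"))
```

The corresponding test skeleton: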
```python
#!/usr/bin/env python
"""Tests for search query translation"""
import json
from pathlib import Path

import pytest

from search_query.parser_base import QueryStringParser
from search_query.parser_xy import XYParser
from search_query.query import Query

# flake8: noqa: E501

# to run (from top-level dir): pytest tests/test_parser_xy.py


@pytest.mark.parametrize(
    "query_string, tokens",
    [
        (
            "AB=(Health)",
            [("AB=", (0, 3)), ("(", (3, 4)), ("Health", (4, 10)), (")", (10, 11))],
        ),
    ],
)
def test_tokenization_xy(query_string: str, tokens: list) -> None:
    xyparser = XYParser(query_string)
    xyparser.tokenize()
    assert xyparser.tokens == tokens, print_debug_tokens(xyparser, tokens, query_string)


def print_debug_tokens(
    xyparser: QueryStringParser, tokens: list, query_string: str
) -> None:
    print(query_string)
    print()
    print(xyparser.tokens)
    print(xyparser.get_token_types(xyparser.tokens, legend=True))
    print(tokens)


directory_path = Path("data/xy")
file_list = list(directory_path.glob("*.json"))


# Use the list of files with pytest.mark.parametrize
@pytest.mark.parametrize("file_path", file_list)
def test_xy_query_parser(file_path: Path) -> None:
    """Test the translation of a search query to a xy query"""
    with open(file_path) as file:
        data = json.load(file)
        query_string = data["search_string"]
        expected = data["parsed"]["search"]

        parser = XYParser(query_string)
        query = parser.parse()
        query_str = query.to_string()
        assert query_str == expected, print_debug(  # type: ignore
            parser, query, query_string, query_str
        )


def print_debug(
    parser: QueryStringParser, query: Query, query_string: str, query_str: str
) -> None:
    print(query_string)
    print()
    print(parser.get_token_types(parser.tokens))
    print(query_str)
    print(query.to_string("structured"))


@pytest.mark.parametrize(
    "query_string, linter_messages",
    [
        (
            "AB-(Health)",
            [],
        ),
    ],
)
def test_linter_xy(query_string: str, linter_messages: list) -> None:
    xyparser = XYParser(query_string)
    try:
        xyparser.parse()
    except Exception:
        pass
    assert xyparser.linter_messages == linter_messages
```
To parse the list format, the numbered sub-queries should be substituted into the combining expression to create a single search string, which can then be parsed with the standard string parser. This avoids redundant implementation.
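A minimal sketch of this substitution (the `"1. ..."` line format is illustrative; the actual item format and the `#1` reference style are defined by `LIST_ITEM_REGEX` and `get_token_str()`):

```python
import re


def build_query_string(query_list: str) -> str:
    """Combine numbered sub-queries into a single search string."""
    items: dict[str, str] = {}
    for line in query_list.splitlines():
        match = re.match(r"^\s*(\d+)\.\s+(.*)$", line)
        if match:
            items[match.group(1)] = match.group(2).strip()
    # Start from the last (combining) item and resolve "#N" references,
    # wrapping each sub-query in parentheses to preserve precedence.
    combined = items[max(items, key=int)]
    for nr in sorted(items, key=int, reverse=True):
        combined = combined.replace(f"#{nr}", f"({items[nr]})")
    return combined


# build_query_string("1. TI=(health)\n2. AB=(literacy)\n3. #1 AND #2")
# -> "(TI=(health)) AND (AB=(literacy))"
```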
Tokenization
The tokenize() method splits query_str into (token, token_type, position) triples based on the combined regex pattern: each match is stripped, classified against the type-specific regexes (parentheses, logic operators, search fields, ...), and appended to self.tokens. Tokens that cannot be classified trigger a QueryErrorCode.TOKENIZING_FAILED linter message.
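For illustration, a minimal sketch of the expected behavior (spans are character offsets into the query string; only token types named in the skeleton above are shown):

```python
from search_query.parser_xy import XYParser

parser = XYParser("(Health) AND (Literacy)")
parser.tokenize()
# parser.tokens now holds (token, token_type, (start, end)) triples, e.g.:
#   ("(", TokenTypes.PARENTHESIS_OPEN, (0, 1))
#   ("AND", TokenTypes.LOGIC_OPERATOR, (9, 12))
```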
Translate search fields: Mapping Fields to Standard-Fields
The search fields supported by the database platform (DB-Fields) may not match the standard fields (Standard-Fields) in `constants.Fields` exactly.
We distinguish the following cases:
1:1 matches
Cases where a 1:1 match exists between DB-Fields and Standard-Fields are added to `constants.SYNTAX_FIELD_MAP`.
1:n matches
Cases where a DB-Field combines multiple Standard-Fields are added to `constants.SYNTAX_COMBINED_FIELDS_MAP`. For example, PubMed offers a search for `[tiab]`, which combines `Fields.TITLE` and `Fields.ABSTRACT`.
When parsing combined DB-Fields, the standard syntax should consist of n nodes, each with the same search term and an atomic Standard-Field. For example, `Literacy[tiab]` should become `(Literacy[ti] OR Literacy[ab])`. When serializing a database string, it is recommended to combine Standard-Fields into DB-Fields whenever possible.
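A minimal sketch of this expansion on the string level (the map entries are illustrative; the actual mapping lives in `constants.SYNTAX_COMBINED_FIELDS_MAP`):

```python
# Illustrative entries only; see constants.SYNTAX_COMBINED_FIELDS_MAP.
COMBINED_FIELDS = {"tiab": ["ti", "ab"]}


def expand_combined_field(term: str, field: str) -> str:
    """Expand a combined DB-Field into an OR over atomic Standard-Fields."""
    atomic = COMBINED_FIELDS[field]
    return "(" + " OR ".join(f"{term}[{f}]" for f in atomic) + ")"


assert expand_combined_field("Literacy", "tiab") == "(Literacy[ti] OR Literacy[ab])"
```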
n:1 matches
If multiple DB-Fields correspond to the same Standard-Field, the combination of the default DB-Field and the Standard-Field is added to `constants.SYNTAX_FIELD_MAP`, and non-default DB-Fields are replaced by the parser. For example, the default for MeSH terms in PubMed is `[mh]`, but the parser also supports `[mesh]`.
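The replacement of non-default DB-Fields can be sketched as a simple lookup (the alias entries are illustrative):

```python
# Map non-default DB-Fields to the platform default (illustrative values).
FIELD_ALIASES = {"mesh": "mh"}


def normalize_db_field(field: str) -> str:
    """Replace a non-default DB-Field with the platform default."""
    return FIELD_ALIASES.get(field.lower(), field.lower())


assert normalize_db_field("MeSH") == "mh"
```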
Search Field Validation in Strict vs. Non-Strict Modes
| Search-Field required | Search String | Search-Field | Mode: Strict | Mode: Non-Strict |
|---|---|---|---|---|
| Yes | With Search-Field | Empty | ok | ok |
| Yes | With Search-Field | Equal to Search-String | ok - search-field-redundant | ok |
| Yes | With Search-Field | Different from Search-String | error: search-field-contradiction | ok - search-field-contradiction. Parser uses Search-String per default |
| Yes | Without Search-Field | Empty | error: search-field-missing | ok - search-field-missing. Parser adds title as the default |
| Yes | Without Search-Field | Given | ok - search-field-extracted | ok |
| No | With Search-Field | Empty | ok | ok |
| No | With Search-Field | Equal to Search-String | ok - search-field-redundant | ok |
| No | With Search-Field | Different from Search-String | error: search-field-contradiction | ok - search-field-contradiction. Parser uses Search-String per default |
| No | Without Search-Field | Empty | ok - search-field-not-specified | ok - Parser uses default of database |
| No | Without Search-Field | Given | ok - search-field-extracted | ok |
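The rules in the table can be sketched as follows (the function, its parameters, and the returned message strings are illustrative, not the library API):

```python
def check_search_field(
    required: bool, in_string: bool, given: bool, equal: bool, strict: bool
) -> str:
    """Return the outcome for a search-field combination (per the table above)."""
    if in_string and given:
        if equal:
            return "ok - search-field-redundant"
        if strict:
            return "error: search-field-contradiction"
        return "ok - search-field-contradiction (Search-String takes precedence)"
    if in_string:
        return "ok"
    if given:
        return "ok - search-field-extracted" if strict else "ok"
    if required:
        if strict:
            return "error: search-field-missing"
        return "ok - search-field-missing (title added as default)"
    return "ok - search-field-not-specified" if strict else "ok (database default)"
```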
Tests
All test data should be stored in standard JSON files; the test skeleton above loads them from `data/xy`.
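Based on the keys accessed in the test skeleton, each file should provide at least the following structure (the expected value is a placeholder):

```json
{
    "search_string": "AB=(Health)",
    "parsed": {
        "search": "<expected string produced by query.to_string()>"
    }
}
```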