Source code for colrev.loader.load_utils_formatter
#! /usr/bin/env python
"""Convenience functions for load formatting"""
from __future__ import annotations
import html
import re
import colrev.env.language_service
import colrev.exceptions as colrev_exceptions
from colrev.constants import Fields
from colrev.constants import FieldValues
from colrev.constants import RecordState
# pylint: disable=too-few-public-methods
[docs]class LoadFormatter:
"""Load formatter class"""
# Based on
# https://en.wikibooks.org/wiki/LaTeX/Special_Characters
_LATEX_SPECIAL_CHAR_MAPPING = {
'\\"a': "ä",
'\\"o': "ö",
'\\"u': "ü",
'\\"A': "Ä",
'\\"O': "Ö",
'\\"U': "Ü",
"\\&": "&",
"\\textendash": "–",
"\\textemdash": "—",
"\\~a": "ã",
"\\'o": "ó",
"\\emph": "",
"\\textit": "",
"\\'e": "é",
"\\`e": "è",
'"a': "ä",
'"o': "ö",
'"u': "ü",
}
_FIELDS_TO_PROCESS = [
Fields.AUTHOR,
Fields.YEAR,
Fields.TITLE,
Fields.JOURNAL,
Fields.BOOKTITLE,
Fields.SERIES,
Fields.VOLUME,
Fields.NUMBER,
Fields.PAGES,
Fields.DOI,
Fields.ABSTRACT,
]
def __init__(self) -> None:
self.language_service = colrev.env.language_service.LanguageService()
def _fix_author_particles(self, record: colrev.record.record.Record) -> None:
# Fix the name particles in the author field
if Fields.AUTHOR in record.data:
names = record.data[Fields.AUTHOR].split(" and ")
for ind, name in enumerate(names):
for prefix in [
"van den",
"von den",
"van der",
"von der",
"vom",
"van",
"von",
]:
if name.startswith(f"{prefix} "):
if "," in name:
name = "{" + name.replace(", ", "}, ")
else:
name = "{" + name + "}"
if name.endswith(f" {prefix}"):
if "," in name:
name = (
"{"
+ prefix
+ " "
+ name[: -len(prefix)].rstrip().replace(", ", "}, ")
)
else:
name = (
"{" + prefix + " " + name[: -len(prefix)].rstrip() + "}"
)
names[ind] = name
record.data[Fields.AUTHOR] = " and ".join(names)
def _format_doi(self, record: colrev.record.record.Record) -> None:
if Fields.DOI in record.data:
record.data[Fields.DOI] = (
record.data[Fields.DOI]
.lower()
.replace("https://", "http://")
.replace("dx.doi.org", "doi.org")
.replace("http://doi.org/", "")
.upper()
)
def _unify_language(self, record: colrev.record.record.Record) -> None:
if Fields.LANGUAGE in record.data and len(record.data[Fields.LANGUAGE]) != 3:
try:
self.language_service.unify_to_iso_639_3_language_codes(record=record)
except colrev_exceptions.InvalidLanguageCodeException:
del record.data[Fields.LANGUAGE]
def _rename_issue_to_number(self, record: colrev.record.record.Record) -> None:
if Fields.NUMBER not in record.data and "issue" in record.data:
record.data[Fields.NUMBER] = record.data.pop("issue")
def _apply_strict_requirements(
self, *, record: colrev.record.record.Record
) -> None:
self._fix_author_particles(record)
self._format_doi(record)
self._unify_language(record)
self._rename_issue_to_number(record)
def _unescape_latex(self, *, input_str: str) -> str:
for latex_char, repl_char in self._LATEX_SPECIAL_CHAR_MAPPING.items():
input_str = input_str.replace(f"{{{latex_char}}}", repl_char)
input_str = input_str.replace(latex_char, repl_char)
return input_str
def _unescape_html(self, *, input_str: str) -> str:
input_str = html.unescape(input_str)
if "<" in input_str:
input_str = re.sub(r"<.*?>", "", input_str)
return input_str
def _unescape_field_values(self, *, record: colrev.record.record.Record) -> None:
for field in record.data:
if field not in self._FIELDS_TO_PROCESS:
continue
record.data[field] = str(record.data[field])
if "\\" in record.data[field]:
record.data[field] = self._unescape_latex(input_str=record.data[field])
record.data[field] = self._unescape_html(input_str=record.data[field])
record.data[field] = record.data[field].replace("\n", " ").rstrip().lstrip()
def _standardize_field_values(self, *, record: colrev.record.record.Record) -> None:
if record.data.get(Fields.TITLE, FieldValues.UNKNOWN) != FieldValues.UNKNOWN:
record.data[Fields.TITLE] = re.sub(
r"\s+", " ", record.data[Fields.TITLE]
).rstrip(".")
# Fix floating point years
if Fields.YEAR in record.data and str(record.data[Fields.YEAR]).endswith(".0"):
record.data[Fields.YEAR] = str(record.data[Fields.YEAR])[:-2]
if Fields.PAGES in record.data:
record.data[Fields.PAGES] = record.data[Fields.PAGES].replace("–", "--")
if record.data[Fields.PAGES].count("-") == 1:
record.data[Fields.PAGES] = record.data[Fields.PAGES].replace("-", "--")
if record.data[Fields.PAGES].lower() == "n.pag":
del record.data[Fields.PAGES]
if record.data.get(Fields.VOLUME, "") == "ahead-of-print":
del record.data[Fields.VOLUME]
if record.data.get(Fields.NUMBER, "") == "ahead-of-print":
del record.data[Fields.NUMBER]
if Fields.URL in record.data and "login?url=https" in record.data[Fields.URL]:
record.data[Fields.URL] = record.data[Fields.URL][
record.data[Fields.URL].find("login?url=https") + 10 :
]
[docs] def run(self, record: colrev.record.record.Record) -> None:
"""Run the load formatter"""
self._apply_strict_requirements(record=record)
if (
Fields.STATUS in record.data
and record.data[Fields.STATUS] != RecordState.md_retrieved
):
return
self._unescape_field_values(record=record)
self._standardize_field_values(record=record)