"""The Language class."""
from __future__ import annotations
import datetime
from dataclasses import dataclass
from typing import Dict, List, NoReturn, Optional, Union, Set
from ._data import (
_PART3_TO_CODES,
_PART3_TO_NAME_INDEX,
_PART3_TO_MACROLANGUAGES,
_PART3_TO_RETIREMENTS,
_PART2B_TO_PART3,
_PART2T_TO_PART3,
_PART1_TO_PART3,
_REF_NAME_TO_PART3,
_PRINT_NAME_TO_PART3,
_INVERTED_NAME_TO_PART3,
_CodesColumn,
_NameIndexColumn,
_RetirementsColumn,
_MacrolanguagesColumn,
_COLUMN_TYPE,
)
# Normalizations applied to the user input, one after another, when a
# non-exact lookup fails on the raw input (see `_get_part3`).
# Lower-casing comes first, title-casing second; both strip surrounding
# whitespace beforehand.
_STRING_CLEANING_FUNCS = [
    lambda text: text.strip().lower(),
    lambda text: text.strip().title(),
]
class LanguageNotFoundError(Exception):
    """Raised when the user input matches no ISO language code or name."""
@dataclass(frozen=True)
class Name:
    """An alternative name of a language, as a print/inverted pair.

    Instances are immutable (the dataclass is frozen) and carry no
    per-instance ``__dict__`` because ``__slots__`` is declared.
    """

    __slots__ = ("print", "inverted")

    # The "print" form of the name.
    print: str
    # The "inverted" form of the name.
    inverted: str
@dataclass(frozen=True)
class Language:
    """A language entry of the ISO 639-3 charts.

    Instances are immutable. Two instances compare equal, and hash the same,
    whenever their ``part3`` codes are equal; no other attribute takes part
    in equality or hashing.
    """

    __slots__ = (
        # Columns of the "codes" table
        "part3",
        "part2b",
        "part2t",
        "part1",
        "scope",
        "type",
        "status",
        "name",
        "comment",
        # Column of the "name_index" table
        "other_names",
        # Column of the "macrolanguages" table
        "macrolanguage",
        # Columns of the "retirements" table
        "retire_reason",
        "retire_change_to",
        "retire_remedy",
        "retire_date",
    )

    # --- "codes" table ---
    part3: str
    # Union[..., None] is used on purpose instead of the equivalent
    # Optional[...]: "Optional" suggests that the attribute itself may be
    # left out, which is not the case -- it is always present, just possibly
    # None. Once Python 3.9 support is dropped, this becomes `... | None`.
    part2b: Union[str, None]
    part2t: Union[str, None]
    part1: Union[str, None]
    scope: str
    type: Union[str, None]
    status: str
    name: str
    comment: Union[str, None]

    # --- "name_index" table ---
    other_names: Union[List[Name], None]

    # --- "macrolanguages" table ---
    macrolanguage: Union[str, None]

    # --- "retirements" table ---
    retire_reason: Union[str, None]
    retire_change_to: Union[str, None]
    retire_remedy: Union[str, None]
    retire_date: Union[datetime.date, None]

    def __hash__(self) -> int:
        # Hash on the ISO 639-3 code only, consistent with __eq__ below.
        return hash(self.part3)

    def __eq__(self, other) -> bool:
        # Anything that is not a Language is never equal to one.
        if not isinstance(other, Language):
            return False
        return self.part3 == other.part3

    @classmethod
    def match(cls, user_input: str, /, *, exact: bool = False) -> Language:
        """Look up a ``Language`` from a language code or a language name.

        Parameters
        ----------
        user_input : str
            A language code or name.
        exact : bool, optional
            Whether the user input must match verbatim. Defaults to `False`,
            in which case an input that does not match as given is retried
            with surrounding whitespace stripped, first lower-cased and then
            title-cased.

        Returns
        -------
        Language

        Raises
        ------
        LanguageNotFoundError
            If nothing matches the user input.

        Notes
        -----
        The input is assumed to be a language code more likely than a
        language name, so codes are tried before names. The precise
        matching order is:

        * ISO 639-3 codes (among the active codes)
        * ISO 639-2 (bibliographic) codes
        * ISO 639-2 (terminological) codes
        * ISO 639-1 codes
        * ISO 639-3 codes (among the retired codes)
        * ISO 639-3 reference language names
        * ISO 639-3 alternative language names (the "print" ones)
        * ISO 639-3 alternative language names (the "inverted" ones)
        """
        # Code columns come before name columns: this is what biases the
        # lookup towards interpreting the input as a code.
        columns: List[_COLUMN_TYPE] = [
            _CodesColumn.ID,
            _CodesColumn.PART2B,
            _CodesColumn.PART2T,
            _CodesColumn.PART1,
            _RetirementsColumn.ID,
            _CodesColumn.REF_NAME,
            _NameIndexColumn.PRINT_NAME,
            _NameIndexColumn.INVERTED_NAME,
        ]
        part3 = _get_part3(user_input, columns, exact)
        return _PART3_TO_LANGUAGES[part3]

    @classmethod
    def from_part3(cls, user_input: str, /) -> Language:
        """Look up a ``Language`` by its ISO 639-3 code (active, then retired)."""
        columns = [_CodesColumn.ID, _RetirementsColumn.ID]
        part3 = _get_part3_exact(user_input, columns)
        return _PART3_TO_LANGUAGES[part3]

    @classmethod
    def from_part2b(cls, user_input: str, /) -> Language:
        """Look up a ``Language`` by its ISO 639-2 (bibliographic) code."""
        part3 = _get_part3_exact(user_input, [_CodesColumn.PART2B])
        return _PART3_TO_LANGUAGES[part3]

    @classmethod
    def from_part2t(cls, user_input: str, /) -> Language:
        """Look up a ``Language`` by its ISO 639-2 (terminological) code."""
        part3 = _get_part3_exact(user_input, [_CodesColumn.PART2T])
        return _PART3_TO_LANGUAGES[part3]

    @classmethod
    def from_part1(cls, user_input: str, /) -> Language:
        """Look up a ``Language`` by its ISO 639-1 code."""
        part3 = _get_part3_exact(user_input, [_CodesColumn.PART1])
        return _PART3_TO_LANGUAGES[part3]

    @classmethod
    def from_name(cls, user_input: str, /) -> Language:
        """Look up a ``Language`` by name.

        The ISO 639-3 reference name is tried first, then the alternative
        "print" names, then the alternative "inverted" names. The input must
        match exactly.
        """
        columns: List[_COLUMN_TYPE] = [
            _CodesColumn.REF_NAME,
            _NameIndexColumn.PRINT_NAME,
            _NameIndexColumn.INVERTED_NAME,
        ]
        part3 = _get_part3_exact(user_input, columns)
        return _PART3_TO_LANGUAGES[part3]
def _raise_language_not_found_error(user_input: str) -> NoReturn:
    """Raise ``LanguageNotFoundError`` with a message quoting `user_input`.

    This function never returns.
    """
    message = f"{user_input!r} isn't an ISO language code or name"
    raise LanguageNotFoundError(message)
def _get_part3(
    user_input: str, query_order: List[_COLUMN_TYPE], exact: bool = True
) -> str:
    """Get the part 3 code of a language.

    The user input is first looked up verbatim. If that fails and `exact` is
    `False`, the lookup is retried once per function in
    `_STRING_CLEANING_FUNCS`, applied to the original input, in list order.

    Parameters
    ----------
    user_input : str
        The user-provided language code or name.
    query_order : List[_COLUMN_TYPE]
        A list of columns to specify query order.
    exact : bool, optional
        Whether to enforce exact matching against the user input. Defaults to `True`.
        If `False`, basic string cleaning is applied to the user input.

    Returns
    -------
    str

    Raises
    ------
    LanguageNotFoundError
        If `user_input` isn't a language name or code.
    """
    try:
        return _get_part3_exact(user_input, query_order)
    except LanguageNotFoundError:
        if exact:
            # Bare `raise` re-raises the original error with its traceback intact.
            raise

    # The retries deliberately run outside the `except` block above, so that
    # the final error isn't reported as having occurred "during handling" of
    # the first failed lookup.
    for clean in _STRING_CLEANING_FUNCS:
        try:
            # Pass the raw input along so that an error message quotes what
            # the user actually typed, not the cleaned variant.
            return _get_part3_exact(clean(user_input), query_order, user_input)
        except LanguageNotFoundError:
            continue

    _raise_language_not_found_error(user_input)
def _get_part3_exact(
    user_input: str,
    query_order: List[_COLUMN_TYPE],
    original_user_input: Optional[str] = None,
) -> str:
    """Resolve the user input to a part 3 code by exact lookup.

    The columns in `query_order` are consulted one by one, and the first
    one that yields a match determines the result.

    Parameters
    ----------
    user_input : str
        The user-provided language code or name.
    query_order : List[_COLUMN_TYPE]
        A list of columns to specify query order.
    original_user_input : str, optional
        The original user input. Default is `None`.
        This argument is used when the user input has been cleaned,
        so that the error message can quote the uncleaned input.

    Returns
    -------
    str

    Raises
    ------
    LanguageNotFoundError
        If the user input matches in none of the given columns.
    ValueError
        If `query_order` contains a column this function cannot query.
    """
    # One entry per supported column: (column, table, keyed_by_part3).
    # Tables keyed by part 3 code answer with the input itself on a hit;
    # the others map the input to its part 3 code.
    # The table is scanned linearly with `==`, in this order.
    dispatch = (
        (_CodesColumn.ID, _PART3_TO_CODES, True),
        (_CodesColumn.PART2B, _PART2B_TO_PART3, False),
        (_CodesColumn.PART2T, _PART2T_TO_PART3, False),
        (_CodesColumn.PART1, _PART1_TO_PART3, False),
        (_RetirementsColumn.ID, _PART3_TO_RETIREMENTS, True),
        (_CodesColumn.REF_NAME, _REF_NAME_TO_PART3, False),
        (_NameIndexColumn.PRINT_NAME, _PRINT_NAME_TO_PART3, False),
        (_NameIndexColumn.INVERTED_NAME, _INVERTED_NAME_TO_PART3, False),
    )

    for column in query_order:
        entry = next((item for item in dispatch if column == item[0]), None)
        if entry is None:
            raise ValueError(f"Invalid column: {column}")

        _, table, keyed_by_part3 = entry
        if keyed_by_part3:
            if user_input in table:
                return user_input
        else:
            hit = table.get(user_input)
            if hit is not None:
                return hit

    # No column produced a match.
    _raise_language_not_found_error(original_user_input or user_input)
def _get_language(part3: str) -> Language:
    """Create a ``Language`` instance.

    The instance is assembled from the "codes", "name_index",
    "macrolanguages" and "retirements" tables. A code found in the "codes"
    table is treated as active (status ``"A"``); otherwise the code is
    treated as retired (status ``"R"``) and its name is taken from the
    "retirements" table. Empty values read from the tables are stored as
    `None`.

    Parameters
    ----------
    part3 : str
        Part 3 code of the language.

    Returns
    -------
    Language

    Raises
    ------
    LanguageNotFoundError
        If `part3` is in neither the "codes" nor the "retirements" table.
    """
    from_codes = _PART3_TO_CODES.get(part3)
    from_macrolanguages = _PART3_TO_MACROLANGUAGES.get(part3) or {}
    from_retirements = _PART3_TO_RETIREMENTS.get(part3) or {}

    if not from_codes and not from_retirements:
        # Fail with a meaningful error rather than a TypeError from
        # subscripting None further down.
        _raise_language_not_found_error(part3)

    if from_codes:
        # The ISO 639-3 code is active.
        ref_name = from_codes[_CodesColumn.REF_NAME]
        part2b = from_codes[_CodesColumn.PART2B]
        part2t = from_codes[_CodesColumn.PART2T]
        part1 = from_codes[_CodesColumn.PART1]
        scope = from_codes[_CodesColumn.SCOPE]
        language_type = from_codes[_CodesColumn.TYPE]
        status = "A"
        comment = from_codes[_CodesColumn.COMMENT]
    else:
        # The ISO 639-3 code is retired.
        ref_name = from_retirements[_RetirementsColumn.REF_NAME]
        part2b = part2t = part1 = None
        scope = "I"
        language_type = None
        status = "R"
        comment = None

    other_names: List[Name] = []
    for row in _PART3_TO_NAME_INDEX.get(part3, []):
        print_name = row[_NameIndexColumn.PRINT_NAME]
        inverted_name = row[_NameIndexColumn.INVERTED_NAME]
        # Skip the entry that merely repeats the reference name in both forms.
        if not (ref_name == print_name == inverted_name):
            other_names.append(Name(print_name, inverted_name))

    retire_date = (
        datetime.datetime.strptime(
            from_retirements[_RetirementsColumn.EFFECTIVE], "%Y-%m-%d"
        ).date()
        if from_retirements
        else None
    )

    # `or None` normalizes empty values (e.g. empty strings, the empty
    # list of other names) to None.
    return Language(
        part3=part3,
        part2b=part2b or None,
        part2t=part2t or None,
        part1=part1 or None,
        scope=scope,
        type=language_type or None,
        status=status,
        name=ref_name,
        comment=comment or None,
        other_names=other_names or None,
        macrolanguage=from_macrolanguages.get(_MacrolanguagesColumn.MID) or None,
        retire_reason=from_retirements.get(_RetirementsColumn.RET_REASON) or None,
        retire_change_to=from_retirements.get(_RetirementsColumn.CHANGE_TO) or None,
        retire_remedy=from_retirements.get(_RetirementsColumn.REMEDY) or None,
        retire_date=retire_date,
    )
def _get_all_languages() -> Dict[str, Language]:
    """Build the mapping from every known part 3 code to its ``Language``.

    The codes of the "codes" table come first, followed by those of the
    "retirements" table. A code present in both tables keeps its position
    from the first pass and ends up with the instance built last.
    """
    return {
        part3: _get_language(part3)
        for part3 in (*_PART3_TO_CODES, *_PART3_TO_RETIREMENTS)
    }
# Built once, when this module is imported. The `Language` classmethods
# index into this mapping to return the instance for a resolved part 3 code.
_PART3_TO_LANGUAGES: Dict[str, Language] = _get_all_languages()
# All the `Language` instances of the mapping above, collected into a set.
ALL_LANGUAGES: Set[Language] = set(_PART3_TO_LANGUAGES.values())