# SPDX-License-Identifier: GPL-3.0-or-later
"""
Functions for checking data exports against their contracts.
"""
import json
from pathlib import Path
from scribe_data.cli.contracts.filter import (
DEFAULT_DATA_CONTRACTS_DIR,
DEFAULT_JSON_EXPORT_DIR,
filter_contract_metadata,
)
from scribe_data.utils import get_language_from_iso, get_language_iso
data_contracts_langs = [
f.stem for f in DEFAULT_DATA_CONTRACTS_DIR.iterdir() if f.is_file()
]
for i in range(len(data_contracts_langs)):
data_contracts_langs[i] = get_language_from_iso(data_contracts_langs[i])
[docs]
def check_contracts(output_dir: str | None = None) -> None:
"""
Check data contracts in the specified or default output directory to ensure data completeness.
Parameters
----------
output_dir : Optional[str], optional
Directory containing exported contract data.
If None, uses the default DEFAULT_JSON_EXPORT_DIR.
"""
export_dir = Path(output_dir or DEFAULT_JSON_EXPORT_DIR)
if not export_dir.exists():
print(
f"Error: Directory {export_dir} does not exist.\nPlease use export JSON first."
)
return
missing_forms = check_contract_data_completeness(export_dir)
print_missing_forms(missing_forms)
[docs]
def check_contract_data_completeness(
export_dir: Path, language: str | None = None
) -> dict[str, dict[str, list[str]]]:
"""
Validate exported data contracts against their metadata requirements.
This function checks if the exported data for a given language (or all languages)
contains all the required forms specified in the data contracts.
Parameters
----------
export_dir : Path
Directory containing exported contract data.
language : Optional[str], optional
Specific language to check. If None, checks all languages in the directory.
Returns
-------
Dict[str, Dict[str, List[str]]]
A nested dictionary containing missing forms by language and data type.
{
'Language Name': {
'nouns': ['missing_noun_form1', 'missing_noun_form2'],
'verbs': ['missing_verb_form1']
}
}
The above is the expected structure.
"""
# Determine languages to check.
if language:
languages_to_check = [language]
elif export_dir.exists():
unique_dirs = {}
for item in export_dir.iterdir():
if item.is_dir():
lower_name = item.name.lower()
# Prioritize strictly lowercase directory names to avoid checking capitalized duplicates.
if lower_name not in unique_dirs or item.name == lower_name:
unique_dirs[lower_name] = item.name
languages_to_check = list(unique_dirs.values())
else:
languages_to_check = [
Path(f).stem.lower() for f in DEFAULT_DATA_CONTRACTS_DIR.glob("*.yaml")
]
languages_to_check = [
lang
for lang in languages_to_check
if lang.lower() in [lang_item.lower() for lang_item in data_contracts_langs]
]
missing_forms = {}
for lang_dir_name in languages_to_check:
lang = " ".join(word.capitalize() for word in lang_dir_name.split("_"))
# Get ISO code and contract file.
try:
iso_code = get_language_iso(lang.lower())
contract_file = DEFAULT_DATA_CONTRACTS_DIR / f"{iso_code.lower()}.yaml"
if not contract_file.exists():
print(f"Warning: No contract file found for {lang}")
continue
except ValueError:
print(f"Warning: Could not find ISO code for {lang}")
continue
# Get contract metadata.
contract_metadata = filter_contract_metadata(contract_file)
export_lang_dir = export_dir / lang_dir_name
# Check missing forms for nouns and verbs.
lang_missing_forms = {}
for data_type in ["nouns", "verbs"]:
# Determine required forms.
required_forms = (
contract_metadata["nouns"]["numbers"]
+ contract_metadata["nouns"]["genders"]
if data_type == "nouns"
else contract_metadata["verbs"]["conjugations"]
)
exported_data_file = export_lang_dir / f"{data_type}.json"
if not exported_data_file.exists():
print(f"Warning: No exported data found for {lang} {data_type}")
if required_forms:
lang_missing_forms[data_type] = required_forms
continue
try:
with open(exported_data_file, "r", encoding="utf-8") as f:
exported_data = json.load(f)
except (json.JSONDecodeError, IOError) as e:
print(f"Error reading {exported_data_file}: {e}")
if required_forms:
lang_missing_forms[data_type] = required_forms
continue
if missing_type_forms := [
form
for form in required_forms
if all(
form not in lexeme_data for lexeme_data in exported_data.values()
)
]:
lang_missing_forms[data_type] = missing_type_forms
# Add to overall missing forms if any.
if lang_missing_forms:
missing_forms[lang] = lang_missing_forms
return missing_forms