Source code for scribe_data.check.check_missing_forms.get_forms

# SPDX-License-Identifier: GPL-3.0-or-later
"""
Get forms from Wikidata SPARQL query files.
"""

import re
from collections import defaultdict

from scribe_data.utils import (
    WIKIDATA_QUERIES_ALL_DATA_DIR,
    language_metadata,
)

iso_to_qid = {
    lang_data["iso"]: lang_data["qid"]
    for lang, lang_data in language_metadata.items()
    if "iso" in lang_data and "qid" in lang_data
}


[docs] def parse_sparql_files() -> dict: """ Read and parse all SPARQL query files to extract form information. Returns ------- dict Accumulated forms for each language and lexical category. Format: {language: {lexical_category: [forms]}}. Notes ----- Recursively searches through WIKIDATA_QUERIES_ALL_DATA_DIR directory for .sparql files and accumulates all form information. """ all_forms = defaultdict(lambda: defaultdict(list)) for sub_sub_file in WIKIDATA_QUERIES_ALL_DATA_DIR.rglob("*.sparql"): with open(sub_sub_file, "r", encoding="utf-8") as query_text: result = parse_sparql_query(query_text.read()) # Accumulate forms for each language and lexical category. for lang, categories in result.items(): for category, forms in categories.items(): if forms: all_forms[lang][category].extend(forms) return all_forms
[docs] def parse_sparql_query(query_text: str) -> dict: """ Parse a SPARQL query to extract lexical categories and features. Parameters ---------- query_text : str Content of the SPARQL query file. Returns ------- dict Dictionary containing parsed information. Format: {language: {lexical_category: [forms]}}. Notes ----- Extracts: - Language QID - Lexical category QID - Grammatical features from OPTIONAL blocks """ # Get language and category first. language = None lexical_category = None # Parse lexical category. lexical_matches = re.finditer(r"wikibase:lexicalCategory\s+wd:(Q\d+)", query_text) for match in lexical_matches: lexical_category = match.group(1) # Parse language. language_matches = re.finditer(r"dct:language\s+wd:(Q\d+)", query_text) for match in language_matches: language = match.group(1) result = {language: {lexical_category: []}} # Parse optional blocks for forms and features. optional_blocks = re.finditer(r"OPTIONAL\s*{([^}]+)}", query_text) for block in optional_blocks: block_text = block.group(1) # Extract grammatical features. features = re.finditer(r"wd:(Q\d+)", block_text) if feature_list := [f.group(1) for f in features]: result[language][lexical_category].append(feature_list) return result