Source code for scribe_data.cli.download.wikidata_lexeme_dump

# SPDX-License-Identifier: GPL-3.0-or-later
"""
Functions for downloading Wikidata lexeme dumps.
"""

import contextlib
import os
import re
from collections.abc import Callable
from datetime import date, datetime
from pathlib import Path

import questionary
import requests
from rich import print as rprint
from tqdm import tqdm

from scribe_data.utils import (
    DEFAULT_WIKIDATA_DUMP_EXPORT_DIR,
    DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR,
    check_lexeme_dump_prompt_download,
)


[docs] def parse_date(date_string: str) -> date | None: """ Parse a date string into a datetime.date object (formats: YYYYMMDD, YYYY/MM/DD, YYYY-MM-DD). Parameters ---------- date_string : str The date string to be parsed. Returns ------- datetime.date Parsed date object if the format is valid. None If the date format is invalid. """ formats = ["%Y%m%d", "%Y/%m/%d", "%Y-%m-%d"] for fmt in formats: try: return datetime.strptime(date_string, fmt).date() except ValueError: continue print( f"Invalid date format: {date_string}. Expected formats: YYYYMMDD, YYYY/MM/DD, or YYYY-MM-DD." ) return None
def available_closest_lexeme_dump_file(
    target_entity: str,
    other_old_dumps: list,
    check_wd_dump_exists: Callable[[str], str | None],
) -> str | None:
    """
    Find the closest available dump file based on the target date.

    Parameters
    ----------
    target_entity : str
        The target date for which the dump is requested (format: YYYY/MM/DD or similar).

    other_old_dumps : list
        List of available dump folders as strings.

    check_wd_dump_exists : function
        A function to validate if the dump file exists. It is called with a folder
        name and must return a truthy value when that folder holds a lexeme dump.

    Returns
    -------
    str
        The closest available dump file date (as a string).

    None
        If the target date cannot be parsed or no suitable dump is found.
    """
    target_date = parse_date(target_entity)
    if target_date is None:
        return None

    closest_date = None
    closest_diff = None

    for dump_folder in other_old_dumps:
        # Skip the parent-directory link of the listing.
        if dump_folder == "..":
            continue

        # A folder whose existence check raises an HTTP error is skipped
        # instead of aborting the whole search.
        with contextlib.suppress(requests.exceptions.HTTPError):
            if not check_wd_dump_exists(dump_folder):
                continue

            current_date = parse_date(dump_folder)
            if current_date is None:
                continue

            diff = abs((current_date - target_date).days)
            if closest_diff is None or diff < closest_diff:
                closest_date = dump_folder
                closest_diff = diff

            # Stop at the first available dump on or after the target date.
            # NOTE(review): this presumes other_old_dumps is in ascending date
            # order - confirm against the caller.
            if current_date >= target_date:
                break

    return closest_date
def download_wd_lexeme_dump(
    target_entity: str = "latest-lexemes",
) -> str | None:
    """
    Download a Wikimedia lexeme dump based on the specified target entity or date.

    Despite the name, this function only resolves the URL of the dump; it does
    not fetch the dump file itself.

    Parameters
    ----------
    target_entity : str, optional
        The target dump to download. Defaults to "latest-lexemes".

        - If "latest-lexemes", downloads the latest dump.
        - If a valid date (e.g., YYYYMMDD), attempts to download the dump for that date.

    Returns
    -------
    str
        The URL of the requested or closest available dump. If the date cannot
        be parsed, or no closest dump is found after an HTTP error, the lookup
        falls back to the latest dump.

    None
        If no suitable dump is found, the user declines to see the closest
        dumps, or the request for the latest dump fails.
    """
    base_url = "https://dumps.wikimedia.org/wikidatawiki/entities"

    def check_wd_dump_exists(target_entity: str) -> str | None:
        """
        Check if the specified dump file exists for a target entity.

        Parameters
        ----------
        target_entity : str
            The target entity or date folder to check.

        Returns
        -------
        str
            The URL of the dump file if it exists.

        None
            If the dump file does not exist.

        Raises
        ------
        requests.exceptions.HTTPError
            If the folder listing cannot be fetched.
        """
        entity_url = f"{base_url}/{target_entity}/"
        entity_response = requests.get(entity_url, timeout=30)
        entity_response.raise_for_status()
        dump_filenames = re.findall(r'href="([^"]+)"', entity_response.text)

        file_url = f"wikidata-{target_entity}-lexemes.json.bz2"
        if file_url in dump_filenames:
            return f"{base_url}/{target_entity}/{file_url}"

        return None

    if target_entity != "latest-lexemes":
        try:
            if requested_date := parse_date(target_entity):
                # Build the folder name from the parsed date so that
                # non-zero-padded input (e.g. 2024/1/5) still maps to YYYYMMDD.
                target_entity = requested_date.strftime("%Y%m%d")
                return check_wd_dump_exists(target_entity)

        except requests.exceptions.HTTPError as http_err:
            # A Response is falsy for 4xx/5xx statuses, so test against None
            # explicitly or the status code would never be shown.
            status_code = (
                http_err.response.status_code
                if http_err.response is not None
                else "Unknown"
            )
            print(f"HTTP error occurred: {http_err} Status code: {status_code}")
            print("We could not find your requested Wikidata lexeme dump.")

            response = requests.get(base_url, timeout=30)
            other_old_dumps = re.findall(r'href="([^"]+)/"', response.text)

            user_response = questionary.confirm(
                "Do you want to see the closest available older dumps?", default=True
            ).ask()

            if not user_response:
                return None

            if other_old_dumps:
                if closest_date := available_closest_lexeme_dump_file(
                    target_entity, other_old_dumps, check_wd_dump_exists
                ):
                    print(
                        f"\nClosest available older dumps(YYYYMMDD): {parse_date(closest_date)}"
                    )
                    fileurl = f"{closest_date}/wikidata-{closest_date}-lexemes.json.bz2"
                    return f"{base_url}/{fileurl}"

            # No closest dump was found: fall through to the latest-dump lookup.

    try:
        response = requests.get(base_url, timeout=30)
        response.raise_for_status()
        latest_dump = re.findall(r'href="([^"]+)"', response.text)

        # Verify the presence of the very file whose URL is returned.
        if "latest-lexemes.json.bz2" in latest_dump:
            return f"{base_url}/latest-lexemes.json.bz2"

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

    return None
def wd_lexeme_dump_download_wrapper(
    dump_snapshot: str | None = None,
    output_dir: Path | None = DEFAULT_WIKIDATA_DUMP_EXPORT_DIR,
    default: bool = False,
) -> Path | bool | None:
    """
    Download Wikidata lexeme dumps given user preferences.

    Parameters
    ----------
    dump_snapshot : str
        Optional date string in YYYYMMDD format for specific dumps.

    output_dir : Path
        Optional directory path for the downloaded file.
        Defaults to 'scribe_data_wikidata_dumps_export' directory.

    default : bool, optional
        If True, skips the user confirmation prompt. Defaults to False.

    Returns
    -------
    Path or None
        - If successful and a dump is downloaded, returns the file path to the downloaded dump.
        - If an existing usable dump is detected, returns whatever
          check_lexeme_dump_prompt_download reports for it.
        - Returns None if the user chooses not to proceed with the download, no valid
          dump URL is found, or an error occurs (the error is printed, not raised).
    """
    try:
        output_dir = output_dir or DEFAULT_WIKIDATA_DUMP_EXPORT_DIR
        os.makedirs(output_dir, exist_ok=True)

        # Don't check for lexeme if date given.
        if not dump_snapshot:
            if useable_file_dir := check_lexeme_dump_prompt_download(output_dir):
                return useable_file_dir

        dump_url = download_wd_lexeme_dump(dump_snapshot or "latest-lexemes")

        if not dump_url:
            rprint("[bold red]No dump URL found.[/bold red]")
            return None

        filename = dump_url.split("/")[-1]
        # Wrapping in Path lets callers pass the directory as a plain string too.
        output_path = Path(output_dir) / filename

        # Use default parameter to bypass user confirmation.
        user_response = (
            default
            or questionary.confirm(
                "We'll be using the Wikidata lexeme dump from dumps.wikimedia.org/wikidatawiki/entities. \nDo you want to proceed?",
                default=True,
            ).ask()
        )

        if not user_response:
            return None

        rprint(f"[bold blue]Downloading dump to {output_path}...[/bold blue]")

        # The context manager releases the streamed connection even on failure.
        with requests.get(dump_url, stream=True, timeout=30) as response:
            # Fail before touching the disk so that an HTTP error page is never
            # saved as the dump and reported as a successful download.
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))

            with open(output_path, "wb") as f:
                with tqdm(
                    total=total_size,
                    unit="iB",
                    unit_scale=True,
                    desc=str(output_path),
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))

        rprint(
            "[bold green]Wikidata lexeme dump download completed successfully![/bold green]"
        )

        return output_path

    except requests.exceptions.RequestException as e:
        rprint(f"[bold red]Error downloading dump: {e}[/bold red]")

    except Exception as e:
        rprint(f"[bold red]An error occurred: {e}[/bold red]")

    return None