Source code for scribe_data.cli.download.wiktionary_dump

# SPDX-License-Identifier: GPL-3.0-or-later
"""
Functions for downloading Wiktionary dumps.
"""

from pathlib import Path

import questionary
import requests
from rich import print as rprint
from tqdm import tqdm

from scribe_data.utils import (
    DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR,
    resolve_lang_iso,
)


[docs] def download_wiktionary_dumps( output_dir: Path = DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR, language_isos: list[str] = ["en"], dump_snapshot: str | None = "latest", ) -> Path | None: """ Download the latest Wiktionary pages-articles dump based on passed language isos. Parameters ---------- output_dir : Path, optional, default=DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR Directory to save the dump. Defaults to DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR. language_isos : List[str], optional, default=['en'] A list of ISO-2 codes for desired Wiktionary dumps. dump_snapshot : str, optional, default='latest' The Wiktionary dump snapshot to be downloaded. Returns ------- Path Path to the downloaded file, or None if aborted/failed. """ if isinstance(language_isos, str): language_isos = [language_isos] resolved_isos = [] not_included_isos = [] for lang in language_isos: iso = resolve_lang_iso(lang) if iso: resolved_isos.append(iso) else: not_included_isos.append(lang) if not_included_isos: iso_or_isos = "iso" if len(not_included_isos) == 1 else "isos" is_or_are = "is" if len(not_included_isos) == 1 else "are" rprint( f"[bold red]The following {iso_or_isos} {is_or_are} not included: {', '.join(not_included_isos)}[/bold red]" ) return None language_isos = resolved_isos wiktionaries = [f"{iso}wiktionary" for iso in language_isos] wiktionary_urls = [f"https://dumps.wikimedia.org/{w}" for w in wiktionaries] Path(output_dir).mkdir(parents=True, exist_ok=True) for i, w, u in zip(language_isos, wiktionaries, wiktionary_urls): # Note: Remove the snapshot from the resulting filename so Scribe-Server always looks for one file. filename = f"{w}-pages-articles.xml.bz2" download_filename = f"{w}-{dump_snapshot}-pages-articles.xml.bz2" download_url = f"{u}/{dump_snapshot}/{download_filename}" rprint(f"[bold blue]Checking dump validity at {download_url}...[/bold blue]") try: response = requests.head(download_url, timeout=30) response.raise_for_status() except requests.exceptions.RequestException as e: rprint(f"[bold red]Invalid dump date or dump not found: {e}[/bold red]") return None output_path = output_dir / filename if output_path.exists(): rprint(f"[bold yellow]Dump already exists: {output_path}[/bold yellow]") user_input = questionary.select( "Do you want to:", choices=[ "Skip download", "Download and overwrite", ], ).ask() if user_input == "Skip download": rprint("[bold green]Skipping download.[/bold green]") return output_path rprint(f"[bold blue]Downloading to {output_path}...[/bold blue]") try: response = requests.get(download_url, stream=True, timeout=30) response.raise_for_status() total_size = int(response.headers.get("content-length", 0)) with open(output_path, "wb") as f: with tqdm( total=total_size, unit="iB", unit_scale=True, desc=download_filename, ) as pbar: for chunk in response.iter_content(chunk_size=8192): if chunk: f.write(chunk) pbar.update(len(chunk)) rprint( f"[bold green]{i.upper()}Wiktionary dump download completed successfully![/bold green]" ) return output_path except requests.exceptions.RequestException as e: rprint(f"[bold red]Download failed: {e}[/bold red]") return None iso_or_isos = "iso" if len(not_included_isos) == 1 else "isos" iso_or_isos = "iso" if len(language_isos) == 1 else "isos" was_or_were = "was" if len(language_isos) == 1 else "were" rprint( f"[bold green]The following {iso_or_isos} {was_or_were} successfully downloaded: {', '.join(language_isos)}[/bold green]" )