# Source code for scribe_data.cli.download.wiktionary_dump

# SPDX-License-Identifier: GPL-3.0-or-later
"""
Functions for downloading Wiktionary dumps.
"""

from pathlib import Path

import questionary
import requests
from rich import print as rprint
from tqdm import tqdm

from scribe_data.utils import (
    DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR,
    resolve_lang_iso,
)



# [docs]
def download_wiktionary_dumps(
    output_dir: Path = DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR,
    language_isos: list[str] = ["en"],
    dump_snapshot: str | None = "latest",
) -> Path | None:
    """
    Download the Wiktionary pages-articles dump for each of the passed language isos.

    Every passed language is resolved with ``resolve_lang_iso`` first. If any of them
    cannot be resolved, nothing is downloaded. Otherwise each dump is checked with a
    HEAD request and then streamed to ``output_dir``. The snapshot is left out of the
    saved filename, so a newer snapshot replaces the older file.

    Parameters
    ----------
    output_dir : Path, optional, default=DEFAULT_WIKTIONARY_DUMP_EXPORT_DIR
        Directory to save the dumps. It is created if it does not exist.

    language_isos : List[str], optional, default=['en']
        A list of ISO-2 codes for desired Wiktionary dumps. A single string is
        treated as a one element list.

    dump_snapshot : str, optional, default='latest'
        The Wiktionary dump snapshot to be downloaded. None is treated as 'latest'.

    Returns
    -------
    Path
        Path to the first dump that was downloaded or skipped because it already
        exists, or None if no language was passed, a language could not be
        resolved, the overwrite prompt was cancelled or a request failed.
    """
    # Note: The default list is never mutated below (the name is only rebound), so sharing it across calls is safe.
    if isinstance(language_isos, str):
        language_isos = [language_isos]

    # Note: A None snapshot would otherwise be formatted into the URL as the literal text "None".
    if dump_snapshot is None:
        dump_snapshot = "latest"

    resolved_isos = []
    not_included_isos = []
    for lang in language_isos:
        iso = resolve_lang_iso(lang)
        if iso:
            resolved_isos.append(iso)

        else:
            not_included_isos.append(lang)

    if not_included_isos:
        iso_or_isos = "iso" if len(not_included_isos) == 1 else "isos"
        is_or_are = "is" if len(not_included_isos) == 1 else "are"
        rprint(
            f"[bold red]The following {iso_or_isos} {is_or_are} not included: {', '.join(not_included_isos)}[/bold red]"
        )
        return None

    language_isos = resolved_isos

    # Note: Normalize once so that a plain string directory also works with the "/" join below.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    first_output_path = None
    downloaded_isos = []
    for iso in language_isos:
        wiktionary = f"{iso}wiktionary"

        # Note: Remove the snapshot from the resulting filename so Scribe-Server always looks for one file.
        filename = f"{wiktionary}-pages-articles.xml.bz2"
        download_filename = f"{wiktionary}-{dump_snapshot}-pages-articles.xml.bz2"
        download_url = f"https://dumps.wikimedia.org/{wiktionary}/{dump_snapshot}/{download_filename}"

        rprint(f"[bold blue]Checking dump validity at {download_url}...[/bold blue]")
        try:
            response = requests.head(download_url, timeout=30)
            response.raise_for_status()

        except requests.exceptions.RequestException as e:
            rprint(f"[bold red]Invalid dump date or dump not found: {e}[/bold red]")
            return None

        output_path = output_dir / filename

        if output_path.exists():
            rprint(f"[bold yellow]Dump already exists: {output_path}[/bold yellow]")
            user_input = questionary.select(
                "Do you want to:",
                choices=[
                    "Skip download",
                    "Download and overwrite",
                ],
            ).ask()

            # Note: ask() returns None when the prompt is cancelled, which must not be read as "overwrite".
            if user_input is None:
                rprint("[bold red]Download aborted.[/bold red]")
                return None

            if user_input == "Skip download":
                rprint("[bold green]Skipping download.[/bold green]")
                if first_output_path is None:
                    first_output_path = output_path

                # Note: Continue with the remaining languages instead of ending the whole run.
                continue

        rprint(f"[bold blue]Downloading to {output_path}...[/bold blue]")
        try:
            # Note: The context manager releases the streamed connection even if the transfer fails.
            with requests.get(download_url, stream=True, timeout=30) as response:
                response.raise_for_status()
                total_size = int(response.headers.get("content-length", 0))

                try:
                    with open(output_path, "wb") as f:
                        with tqdm(
                            total=total_size,
                            unit="iB",
                            unit_scale=True,
                            desc=download_filename,
                        ) as pbar:
                            for chunk in response.iter_content(chunk_size=8192):
                                if chunk:
                                    f.write(chunk)
                                    pbar.update(len(chunk))

                except requests.exceptions.RequestException:
                    # Note: The file is a truncated archive here, so remove it rather than
                    # have a later run report it as an already existing dump.
                    output_path.unlink(missing_ok=True)
                    raise

        except requests.exceptions.RequestException as e:
            rprint(f"[bold red]Download failed: {e}[/bold red]")
            return None

        rprint(
            f"[bold green]{iso.upper()}Wiktionary dump download completed successfully![/bold green]"
        )
        downloaded_isos.append(iso)
        if first_output_path is None:
            first_output_path = output_path

    # Note: Only report languages that were actually downloaded in this run.
    if downloaded_isos:
        iso_or_isos = "iso" if len(downloaded_isos) == 1 else "isos"
        was_or_were = "was" if len(downloaded_isos) == 1 else "were"
        rprint(
            f"[bold green]The following {iso_or_isos} {was_or_were} successfully downloaded: {', '.join(downloaded_isos)}[/bold green]"
        )

    return first_output_path