# Source code for scribe_data.check.check_missing_forms.split_query

# SPDX-License-Identifier: GPL-3.0-or-later
"""
Split forms into groups of up to six forms per query based on identifiers.
"""

from collections import defaultdict

from scribe_data.check.check_missing_forms.generate_query import generate_query



# [docs] -- Sphinx "viewcode" link marker left over from the HTML page; not part of the module.
def split_group_by_identifier(
    language_entry,
    output_dir,
    sub_lang_iso_code=None,
    *,
    max_forms_per_query=6,
):
    """
    Split forms into groups of limited size and generate one query per group.

    Parameters
    ----------
    language_entry : dict
        Dictionary containing language data with missing features.
        Format: {language_qid: {data_type_qid: [features]}}.

    output_dir : str or Path
        Directory where generated query files should be saved.
        Passed through unchanged to ``generate_query``.

    sub_lang_iso_code : str, optional
        ISO code for sub-language if applicable.
        Passed through unchanged to ``generate_query``.

    max_forms_per_query : int, optional
        Maximum number of feature lists placed in a single query.
        Keyword-only; defaults to 6.

    Raises
    ------
    ValueError
        If ``max_forms_per_query`` is smaller than 1.

    Notes
    -----
    For every (language, data type) pair the feature lists are grouped by
    ``_group_features_for_queries`` and ``generate_query`` is called once per
    resulting group with an entry of the form ``{lang: {data_type: group}}``.
    A progress line is printed before each call. Nothing is returned.
    """
    if max_forms_per_query < 1:
        raise ValueError("max_forms_per_query must be at least 1")

    for lang, data in language_entry.items():
        for data_type, missing_features_list in data.items():
            query_groups = _group_features_for_queries(
                missing_features_list, max_forms_per_query
            )
            total_queries = len(query_groups)

            for query_number, group in enumerate(query_groups, start=1):
                print(
                    f"Generating query {query_number}/{total_queries} for {lang} - {data_type} with {len(group)} features"
                )

                # Each query only sees the features of its own group.
                generate_query(
                    {lang: {data_type: group}},
                    output_dir,
                    sub_lang_iso_code=sub_lang_iso_code,
                )


def _group_features_for_queries(missing_features_list, max_size):
    """
    Partition feature lists into groups of at most ``max_size`` entries.

    Parameters
    ----------
    missing_features_list : iterable of list
        Feature lists; each one is a sequence of identifiers. Empty feature
        lists are ignored.

    max_size : int
        Maximum number of feature lists per returned group (must be >= 1).

    Returns
    -------
    list of list
        Groups of feature lists, ordered from the smallest merged group
        upwards. Every non-empty input feature list appears in exactly one
        group.
    """
    # Private marker for "this feature list has no second identifier"; an
    # object() can never collide with a real identifier value.
    no_second_identifier = object()

    # Step 1: bucket the feature lists by their first identifier.
    by_first_identifier = defaultdict(list)
    for feature_list in missing_features_list:
        if feature_list:
            by_first_identifier[feature_list[0]].append(feature_list)

    # Step 2: keep small buckets as they are; split oversized buckets by the
    # second identifier and then into fixed-size chunks.
    candidate_groups = []
    for features in by_first_identifier.values():
        if len(features) <= max_size:
            candidate_groups.append(features)
            continue

        by_second_identifier = defaultdict(list)
        for feature_list in features:
            second_key = (
                feature_list[1] if len(feature_list) > 1 else no_second_identifier
            )
            by_second_identifier[second_key].append(feature_list)

        for second_features in by_second_identifier.values():
            candidate_groups.extend(
                second_features[start : start + max_size]
                for start in range(0, len(second_features), max_size)
            )

    # Step 3: greedily merge groups, smallest first, to reduce the number of
    # queries. list.sort is stable, so equally sized groups keep their order.
    candidate_groups.sort(key=len)

    merged_groups = []
    current_group = []
    for group in candidate_groups:
        if len(current_group) + len(group) <= max_size:
            current_group.extend(group)
        else:
            if current_group:
                merged_groups.append(current_group)
            # Copy so that later extends never mutate a candidate group.
            current_group = list(group)

    if current_group:
        merged_groups.append(current_group)

    return merged_groups