# SPDX-License-Identifier: GPL-3.0-or-later
"""
Split forms into groups of up to six forms per query based on identifiers.
"""
from collections import defaultdict
from pathlib import Path
from scribe_data.check.check_missing_forms.generate_query import generate_query
[docs]
def split_group_by_identifier(
language_entry: dict,
output_dir: Path,
sub_lang_iso_code: str | None = None,
) -> None:
"""
Split forms into groups of up to six forms per query based on identifiers.
Parameters
----------
language_entry : dict
Dictionary containing language data with missing features.
Format: {language_qid: {data_type_qid: [features]}}.
output_dir : Path
Directory where generated query files should be saved.
sub_lang_iso_code : str, optional
ISO code for sub-language if applicable.
Notes
-----
Groups forms based on their identifiers to avoid generating too many queries.
Combines small groups when possible to reduce the number of query files.
"""
for lang, data in language_entry.items():
for data_type, missing_features_list in data.items():
# Group features by their first identifier.
identifier_groups: defaultdict[str, list] = defaultdict(list)
# First try to group by the first identifier in each feature list.
for feature_list in missing_features_list:
if feature_list: # skip empty lists
# Use the first identifier as the grouping key.
key = feature_list[0]
identifier_groups[key].append(feature_list)
# Now check if any groups have more than 6 features.
final_groups: list[list] = []
for features in identifier_groups.values():
if len(features) <= 6:
# This group is small enough so keep it as is.
final_groups.append(features)
else:
# This group is too large so it needs to split further by the second identifier.
second_level_groups: defaultdict[str, list] = defaultdict(list)
for feature_list in features:
if len(feature_list) > 1:
# Use the second identifier for further grouping.
second_key = feature_list[1]
second_level_groups[second_key].append(feature_list)
else:
# If there's only one identifier, make it its own group.
second_level_groups["single_identifier"].append(
feature_list
)
# Further split if necessary and add to final groups.
for second_features in second_level_groups.values():
# Split into chunks of 6.
for i in range(0, len(second_features), 6):
chunk = second_features[i : i + 6]
final_groups.append(chunk)
# Now combine small groups if possible to reduce query files.
optimized_groups: list[list] = []
current_group: list = []
# Sort groups by size to try combining smaller ones first.
final_groups.sort(key=len)
for group in final_groups:
if len(current_group) + len(group) <= 6:
# Can add this group to the current one.
current_group.extend(group)
else:
# Current group is full, so start a new one.
if current_group:
optimized_groups.append(current_group)
current_group = group
# Add the last group if not empty.
if current_group:
optimized_groups.append(current_group)
# Generate queries for each optimized group.
for i, group in enumerate(optimized_groups):
# Create a new language entry for this group.
group = group
group_entry = {lang: {data_type: group}}
print(
f"Generating query {i + 1}/{len(optimized_groups)} for {lang} - {data_type} with {len(group)} features"
)
# Call generate_query with the grouped features.
generate_query(
missing_features=group_entry,
query_dir=output_dir,
sub_lang_iso_code=sub_lang_iso_code,
)