| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157 |
- from __future__ import annotations
- from copy import deepcopy
- from datetime import datetime, timezone
- from pathlib import Path
- from typing import Any
- from analysis.data_pipeline import build_dataset, build_dataset_splits
- def _percent(part: int, whole: int) -> float:
- if whole <= 0:
- return 0.0
- return 100.0 * float(part) / float(whole)
- def compute_dataset_summary(
- config: dict[str, Any],
- root_dir: Path,
- positive_class_index: int,
- ) -> dict[str, Any]:
- # Force CPU so summary generation works without GPU availability.
- summary_config = deepcopy(config)
- summary_config.setdefault("training", {})
- summary_config["training"]["device"] = "cpu"
- dataset, xls_file = build_dataset(summary_config, root_dir)
- seed = int(config["data"]["seed"])
- splits = build_dataset_splits(summary_config, dataset, xls_file, seed=seed)
- split_names = ["train", "validation", "test"]
- requested_ratios = [float(v) for v in config["data"]["data_splits"]]
- if len(splits) != 3:
- raise ValueError(f"Expected 3 dataset splits, got {len(splits)}.")
- total_images = int(len(dataset))
- labels = (dataset.expected_classes[:, positive_class_index] >= 0.5).int()
- splits_summary: list[dict[str, Any]] = []
- assigned = 0
- assigned_positive = 0
- for split_name, requested_ratio, subset in zip(
- split_names,
- requested_ratios,
- splits,
- strict=True,
- ):
- indices = list(subset.indices)
- split_count = int(len(indices))
- split_positive = int(labels[indices].sum().item()) if split_count > 0 else 0
- split_negative = split_count - split_positive
- assigned += split_count
- assigned_positive += split_positive
- splits_summary.append(
- {
- "split": split_name,
- "requested_ratio": requested_ratio,
- "image_count": split_count,
- "image_pct_of_dataset": _percent(split_count, total_images),
- "positive_count": split_positive,
- "negative_count": split_negative,
- "positive_pct_within_split": _percent(split_positive, split_count),
- "negative_pct_within_split": _percent(split_negative, split_count),
- }
- )
- if assigned != total_images:
- raise ValueError(
- f"Split coverage mismatch: assigned {assigned} images, expected {total_images}."
- )
- total_positive = assigned_positive
- total_negative = total_images - total_positive
- return {
- "generated_utc": datetime.now(timezone.utc).isoformat(),
- "seed": seed,
- "positive_class_index": int(positive_class_index),
- "totals": {
- "image_count": total_images,
- "positive_count": total_positive,
- "negative_count": total_negative,
- "positive_pct": _percent(total_positive, total_images),
- "negative_pct": _percent(total_negative, total_images),
- },
- "splits": splits_summary,
- }
- def _markdown_table(summary: dict[str, Any]) -> str:
- lines = [
- "| Split | Requested % | Images | Dataset % | Positive | Negative | Positive % (split) | Negative % (split) |",
- "|---|---:|---:|---:|---:|---:|---:|---:|",
- ]
- for item in summary["splits"]:
- lines.append(
- "| "
- f"{item['split'].title()} | "
- f"{item['requested_ratio'] * 100.0:.2f}% | "
- f"{item['image_count']} | "
- f"{item['image_pct_of_dataset']:.2f}% | "
- f"{item['positive_count']} | "
- f"{item['negative_count']} | "
- f"{item['positive_pct_within_split']:.2f}% | "
- f"{item['negative_pct_within_split']:.2f}% |"
- )
- return "\n".join(lines)
- def write_dataset_summary_markdown(summary: dict[str, Any], path: Path) -> None:
- totals = summary["totals"]
- lines = [
- "# Dataset Composition Summary",
- "",
- f"Generated (UTC): {summary['generated_utc']}",
- f"Split seed: {summary['seed']}",
- f"Positive class index: {summary['positive_class_index']}",
- "",
- "## Overall",
- "",
- f"- Total images: {totals['image_count']}",
- f"- Positive images: {totals['positive_count']} ({totals['positive_pct']:.2f}%)",
- f"- Negative images: {totals['negative_count']} ({totals['negative_pct']:.2f}%)",
- "",
- "## Train / Validation / Test Breakdown",
- "",
- _markdown_table(summary),
- "",
- ]
- path.write_text("\n".join(lines), encoding="utf-8")
- def run_dataset_summary(
- config: dict[str, Any],
- root_dir: Path,
- output_dir: Path,
- positive_class_index: int,
- ) -> dict[str, Any]:
- summary = compute_dataset_summary(
- config=config,
- root_dir=root_dir,
- positive_class_index=positive_class_index,
- )
- markdown_path = output_dir / "dataset_summary.md"
- write_dataset_summary_markdown(summary, markdown_path)
- return {
- "summary_markdown": str(markdown_path),
- "summary": summary,
- }
|