| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195 |
- from __future__ import annotations
- import argparse
- import json
- import os
- import pathlib as pl
- import subprocess
- import sys
- import time
- from datetime import datetime, timezone
def utc_now_iso() -> str:
    """Return the current time in UTC as an ISO 8601 string.

    The timestamp is timezone-aware, so the text carries an explicit
    ``+00:00`` offset.
    """
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the training runner.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, which is
            what happens when the script is started from a shell. Passing an
            explicit list lets the parser be driven programmatically.

    Returns:
        A namespace with the attributes ``python``, ``workdir``, ``log_dir``,
        ``continue_on_error`` and ``dry_run``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Run ensemble training followed by bayesian training using existing scripts."
        )
    )
    parser.add_argument(
        "--python",
        default=sys.executable,
        help="Python executable to use for child scripts (default: current interpreter).",
    )
    parser.add_argument(
        "--workdir",
        # Defaults to the directory that contains this script.
        default=str(pl.Path(__file__).resolve().parent),
        help="Working directory for training scripts (default: this script directory).",
    )
    parser.add_argument(
        "--log-dir",
        # None is kept as-is here; the "<workdir>/logs" fallback described in
        # the help text has to be applied by the caller.
        default=None,
        help="Directory for log and summary files (default: <workdir>/logs).",
    )
    parser.add_argument(
        "--continue-on-error",
        action="store_true",
        help="Continue to the next stage even if a stage fails.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print resolved commands and paths without running training.",
    )
    return parser.parse_args(argv)
def run_stage(
    stage_name: str,
    command: list[str],
    workdir: pl.Path,
    log_file: pl.Path,
) -> dict[str, object]:
    """Run one stage as a child process, teeing its output to a log file.

    The child's stdout and stderr are merged, echoed line by line to this
    process's stdout and written to ``log_file``. The log also gets a header
    (start time, command, working directory) and a footer (finish time, exit
    code, duration).

    Args:
        stage_name: Label used in the log file and in the returned record.
        command: Program and arguments, passed to ``subprocess.Popen`` as a
            list (no shell is involved).
        workdir: Directory the child process is started in.
        log_file: Log path; parent directories are created if missing and an
            existing file is overwritten.

    Returns:
        A dict with the keys ``stage``, ``command``, ``started_at_utc``,
        ``finished_at_utc``, ``duration_seconds``, ``exit_code``, ``status``
        (``"success"`` for exit code 0, otherwise ``"failed"``) and
        ``log_file``.

    Raises:
        OSError: If the log file cannot be opened or the child process cannot
            be launched (for example, the executable does not exist).
    """
    started = utc_now_iso()
    # Monotonic clock, so the duration is immune to wall-clock adjustments.
    start_time = time.monotonic()
    log_file.parent.mkdir(parents=True, exist_ok=True)

    with open(log_file, "w", encoding="utf-8") as log:
        log.write(f"[{started}] Starting stage: {stage_name}\n")
        log.write(f"Command: {' '.join(command)}\n")
        log.write(f"Working directory: {workdir}\n\n")

        # The context manager closes the pipe and reaps the child on exit.
        with subprocess.Popen(
            command,
            cwd=str(workdir),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # interleave stderr with stdout
            text=True,
            # Bytes that are invalid in the locale encoding are rendered as
            # \xNN escapes instead of raising UnicodeDecodeError mid-stream.
            errors="backslashreplace",
            bufsize=1,  # line-buffered pipe on the parent side
            env=os.environ.copy(),
        ) as process:
            try:
                # stdout is always set with stdout=PIPE; the guard keeps type
                # checkers satisfied.
                if process.stdout is not None:
                    for line in process.stdout:
                        print(line, end="")
                        log.write(line)
                return_code = process.wait()
            except BaseException:
                # Do not leave the child running if streaming or logging
                # fails or the run is interrupted.
                process.kill()
                raise

        finished = utc_now_iso()
        duration_seconds = time.monotonic() - start_time
        log.write(
            "\n"
            f"[{finished}] Finished stage: {stage_name}\n"
            f"Exit code: {return_code}\n"
            f"Duration seconds: {duration_seconds:.2f}\n"
        )

    return {
        "stage": stage_name,
        "command": command,
        "started_at_utc": started,
        "finished_at_utc": finished,
        "duration_seconds": duration_seconds,
        "exit_code": return_code,
        "status": "success" if return_code == 0 else "failed",
        "log_file": str(log_file),
    }
def main() -> int:
    """Run the ensemble stage and then the bayesian stage; write a summary.

    Resolves the working and log directories from the command line, checks
    that both training scripts exist, and then either prints the plan
    (``--dry-run``) or runs each stage through ``run_stage``. A JSON summary
    of the run is written next to the per-stage logs.

    Returns:
        ``2`` if a required script is missing, ``0`` for a dry run or when
        every stage succeeds, otherwise the exit code of the failing stage
        (the last failing one when ``--continue-on-error`` is set).
    """
    options = parse_args()
    base_dir = pl.Path(options.workdir).resolve()
    if options.log_dir is None:
        logs_root = base_dir / "logs"
    else:
        logs_root = pl.Path(options.log_dir).resolve()

    # The directory name uses a naive local-time stamp.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = logs_root / f"overnight_{stamp}"

    # Insertion order of this mapping is the execution order of the stages.
    script_by_stage = {
        "ensemble": base_dir / "train_ensemble.py",
        "bayesian": base_dir / "train_bayesian.py",
    }
    missing = next(
        (path for path in script_by_stage.values() if not path.exists()),
        None,
    )
    if missing is not None:
        print(f"Error: required script not found: {missing}")
        return 2

    plan = [
        (stage, [options.python, str(path)])
        for stage, path in script_by_stage.items()
    ]

    if options.dry_run:
        print("Dry run: no training scripts will be executed.")
        print(f"Working directory: {base_dir}")
        print(f"Run log directory: {run_dir}")
        for stage, cmd in plan:
            print(f"Stage {stage}: {' '.join(cmd)}")
        return 0

    run_dir.mkdir(parents=True, exist_ok=True)
    summary_file = run_dir / "run_summary.json"
    began_at = utc_now_iso()
    began_clock = time.monotonic()

    outcomes: list[dict[str, object]] = []
    exit_status = 0
    for stage, cmd in plan:
        print(f"\n=== Starting {stage} training ===")
        outcome = run_stage(stage, cmd, base_dir, run_dir / f"{stage}.log")
        outcomes.append(outcome)

        stage_code = int(outcome["exit_code"])
        if stage_code == 0 or options.continue_on_error:
            continue

        # First failure without --continue-on-error ends the run.
        exit_status = stage_code
        print(
            f"Stage '{stage}' failed with exit code {outcome['exit_code']}. "
            "Stopping because --continue-on-error was not set."
        )
        break

    if exit_status == 0:
        # Reached when no stage aborted the loop; with --continue-on-error
        # the most recent failure decides the overall exit code.
        failure_codes = [
            int(item["exit_code"])
            for item in outcomes
            if int(item["exit_code"]) != 0
        ]
        if failure_codes:
            exit_status = failure_codes[-1]

    ended_at = utc_now_iso()
    elapsed = time.monotonic() - began_clock
    overall = "success" if exit_status == 0 else "failed"
    report = {
        "run_started_at_utc": began_at,
        "run_finished_at_utc": ended_at,
        "total_duration_seconds": elapsed,
        "workdir": str(base_dir),
        "python_executable": options.python,
        "continue_on_error": options.continue_on_error,
        "final_exit_code": exit_status,
        "overall_status": overall,
        "stages": outcomes,
    }
    with open(summary_file, "w", encoding="utf-8") as handle:
        json.dump(report, handle, indent=2)

    print("\n=== Overnight run complete ===")
    print(f"Summary: {summary_file}")
    print(f"Logs directory: {run_dir}")
    return exit_status
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|