from __future__ import annotations import argparse import json import os import pathlib as pl import subprocess import sys import time from datetime import datetime, timezone def utc_now_iso() -> str: return datetime.now(timezone.utc).isoformat() def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description=( "Run ensemble training followed by bayesian training using existing scripts." ) ) parser.add_argument( "--python", default=sys.executable, help="Python executable to use for child scripts (default: current interpreter).", ) parser.add_argument( "--workdir", default=str(pl.Path(__file__).resolve().parent), help="Working directory for training scripts (default: this script directory).", ) parser.add_argument( "--log-dir", default=None, help="Directory for log and summary files (default: /logs).", ) parser.add_argument( "--continue-on-error", action="store_true", help="Continue to the next stage even if a stage fails.", ) parser.add_argument( "--dry-run", action="store_true", help="Print resolved commands and paths without running training.", ) return parser.parse_args() def run_stage( stage_name: str, command: list[str], workdir: pl.Path, log_file: pl.Path, ) -> dict[str, object]: started = utc_now_iso() start_time = time.monotonic() log_file.parent.mkdir(parents=True, exist_ok=True) with open(log_file, "w", encoding="utf-8") as log: log.write(f"[{started}] Starting stage: {stage_name}\n") log.write(f"Command: {' '.join(command)}\n") log.write(f"Working directory: {workdir}\n\n") process = subprocess.Popen( command, cwd=str(workdir), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, universal_newlines=True, env=os.environ.copy(), ) if process.stdout is not None: for line in process.stdout: print(line, end="") log.write(line) return_code = process.wait() finished = utc_now_iso() duration_seconds = time.monotonic() - start_time log.write( ( "\n" f"[{finished}] Finished stage: {stage_name}\n" f"Exit code: {return_code}\n" f"Duration seconds: {duration_seconds:.2f}\n" ) ) return { "stage": stage_name, "command": command, "started_at_utc": started, "finished_at_utc": finished, "duration_seconds": duration_seconds, "exit_code": return_code, "status": "success" if return_code == 0 else "failed", "log_file": str(log_file), } def main() -> int: args = parse_args() workdir = pl.Path(args.workdir).resolve() log_dir = ( pl.Path(args.log_dir).resolve() if args.log_dir is not None else workdir / "logs" ) run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S") run_log_dir = log_dir / f"overnight_{run_stamp}" ensemble_script = workdir / "train_ensemble.py" bayesian_script = workdir / "train_bayesian.py" for script in (ensemble_script, bayesian_script): if not script.exists(): print(f"Error: required script not found: {script}") return 2 stages = [ ("ensemble", [args.python, str(ensemble_script)]), ("bayesian", [args.python, str(bayesian_script)]), ] if args.dry_run: print("Dry run: no training scripts will be executed.") print(f"Working directory: {workdir}") print(f"Run log directory: {run_log_dir}") for stage_name, command in stages: print(f"Stage {stage_name}: {' '.join(command)}") return 0 run_log_dir.mkdir(parents=True, exist_ok=True) summary_path = run_log_dir / "run_summary.json" run_started = utc_now_iso() run_start_time = time.monotonic() stage_results: list[dict[str, object]] = [] final_exit_code = 0 for stage_name, command in stages: print(f"\n=== Starting {stage_name} training ===") log_file = run_log_dir / f"{stage_name}.log" result = run_stage(stage_name, command, workdir, log_file) stage_results.append(result) if int(result["exit_code"]) != 0 and not args.continue_on_error: final_exit_code = int(result["exit_code"]) print( ( f"Stage '{stage_name}' failed with exit code {result['exit_code']}. " "Stopping because --continue-on-error was not set." ) ) break if final_exit_code == 0: failed = [r for r in stage_results if int(r["exit_code"]) != 0] if failed: final_exit_code = int(failed[-1]["exit_code"]) run_finished = utc_now_iso() total_duration = time.monotonic() - run_start_time summary = { "run_started_at_utc": run_started, "run_finished_at_utc": run_finished, "total_duration_seconds": total_duration, "workdir": str(workdir), "python_executable": args.python, "continue_on_error": args.continue_on_error, "final_exit_code": final_exit_code, "overall_status": "success" if final_exit_code == 0 else "failed", "stages": stage_results, } with open(summary_path, "w", encoding="utf-8") as f: json.dump(summary, f, indent=2) print("\n=== Overnight run complete ===") print(f"Summary: {summary_path}") print(f"Logs directory: {run_log_dir}") return final_exit_code if __name__ == "__main__": raise SystemExit(main())