run_overnight_training.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. from __future__ import annotations
  2. import argparse
  3. import json
  4. import os
  5. import pathlib as pl
  6. import subprocess
  7. import sys
  8. import time
  9. from datetime import datetime, timezone
  10. def utc_now_iso() -> str:
  11. return datetime.now(timezone.utc).isoformat()
  12. def parse_args() -> argparse.Namespace:
  13. parser = argparse.ArgumentParser(
  14. description=(
  15. "Run ensemble training followed by bayesian training using existing scripts."
  16. )
  17. )
  18. parser.add_argument(
  19. "--python",
  20. default=sys.executable,
  21. help="Python executable to use for child scripts (default: current interpreter).",
  22. )
  23. parser.add_argument(
  24. "--workdir",
  25. default=str(pl.Path(__file__).resolve().parent),
  26. help="Working directory for training scripts (default: this script directory).",
  27. )
  28. parser.add_argument(
  29. "--log-dir",
  30. default=None,
  31. help="Directory for log and summary files (default: <workdir>/logs).",
  32. )
  33. parser.add_argument(
  34. "--continue-on-error",
  35. action="store_true",
  36. help="Continue to the next stage even if a stage fails.",
  37. )
  38. parser.add_argument(
  39. "--dry-run",
  40. action="store_true",
  41. help="Print resolved commands and paths without running training.",
  42. )
  43. return parser.parse_args()
  44. def run_stage(
  45. stage_name: str,
  46. command: list[str],
  47. workdir: pl.Path,
  48. log_file: pl.Path,
  49. ) -> dict[str, object]:
  50. started = utc_now_iso()
  51. start_time = time.monotonic()
  52. log_file.parent.mkdir(parents=True, exist_ok=True)
  53. with open(log_file, "w", encoding="utf-8") as log:
  54. log.write(f"[{started}] Starting stage: {stage_name}\n")
  55. log.write(f"Command: {' '.join(command)}\n")
  56. log.write(f"Working directory: {workdir}\n\n")
  57. process = subprocess.Popen(
  58. command,
  59. cwd=str(workdir),
  60. stdout=subprocess.PIPE,
  61. stderr=subprocess.STDOUT,
  62. text=True,
  63. bufsize=1,
  64. universal_newlines=True,
  65. env=os.environ.copy(),
  66. )
  67. if process.stdout is not None:
  68. for line in process.stdout:
  69. print(line, end="")
  70. log.write(line)
  71. return_code = process.wait()
  72. finished = utc_now_iso()
  73. duration_seconds = time.monotonic() - start_time
  74. log.write(
  75. (
  76. "\n"
  77. f"[{finished}] Finished stage: {stage_name}\n"
  78. f"Exit code: {return_code}\n"
  79. f"Duration seconds: {duration_seconds:.2f}\n"
  80. )
  81. )
  82. return {
  83. "stage": stage_name,
  84. "command": command,
  85. "started_at_utc": started,
  86. "finished_at_utc": finished,
  87. "duration_seconds": duration_seconds,
  88. "exit_code": return_code,
  89. "status": "success" if return_code == 0 else "failed",
  90. "log_file": str(log_file),
  91. }
  92. def main() -> int:
  93. args = parse_args()
  94. workdir = pl.Path(args.workdir).resolve()
  95. log_dir = (
  96. pl.Path(args.log_dir).resolve()
  97. if args.log_dir is not None
  98. else workdir / "logs"
  99. )
  100. run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  101. run_log_dir = log_dir / f"overnight_{run_stamp}"
  102. ensemble_script = workdir / "train_ensemble.py"
  103. bayesian_script = workdir / "train_bayesian.py"
  104. for script in (ensemble_script, bayesian_script):
  105. if not script.exists():
  106. print(f"Error: required script not found: {script}")
  107. return 2
  108. stages = [
  109. ("ensemble", [args.python, str(ensemble_script)]),
  110. ("bayesian", [args.python, str(bayesian_script)]),
  111. ]
  112. if args.dry_run:
  113. print("Dry run: no training scripts will be executed.")
  114. print(f"Working directory: {workdir}")
  115. print(f"Run log directory: {run_log_dir}")
  116. for stage_name, command in stages:
  117. print(f"Stage {stage_name}: {' '.join(command)}")
  118. return 0
  119. run_log_dir.mkdir(parents=True, exist_ok=True)
  120. summary_path = run_log_dir / "run_summary.json"
  121. run_started = utc_now_iso()
  122. run_start_time = time.monotonic()
  123. stage_results: list[dict[str, object]] = []
  124. final_exit_code = 0
  125. for stage_name, command in stages:
  126. print(f"\n=== Starting {stage_name} training ===")
  127. log_file = run_log_dir / f"{stage_name}.log"
  128. result = run_stage(stage_name, command, workdir, log_file)
  129. stage_results.append(result)
  130. if int(result["exit_code"]) != 0 and not args.continue_on_error:
  131. final_exit_code = int(result["exit_code"])
  132. print(
  133. (
  134. f"Stage '{stage_name}' failed with exit code {result['exit_code']}. "
  135. "Stopping because --continue-on-error was not set."
  136. )
  137. )
  138. break
  139. if final_exit_code == 0:
  140. failed = [r for r in stage_results if int(r["exit_code"]) != 0]
  141. if failed:
  142. final_exit_code = int(failed[-1]["exit_code"])
  143. run_finished = utc_now_iso()
  144. total_duration = time.monotonic() - run_start_time
  145. summary = {
  146. "run_started_at_utc": run_started,
  147. "run_finished_at_utc": run_finished,
  148. "total_duration_seconds": total_duration,
  149. "workdir": str(workdir),
  150. "python_executable": args.python,
  151. "continue_on_error": args.continue_on_error,
  152. "final_exit_code": final_exit_code,
  153. "overall_status": "success" if final_exit_code == 0 else "failed",
  154. "stages": stage_results,
  155. }
  156. with open(summary_path, "w", encoding="utf-8") as f:
  157. json.dump(summary, f, indent=2)
  158. print("\n=== Overnight run complete ===")
  159. print(f"Summary: {summary_path}")
  160. print(f"Logs directory: {run_log_dir}")
  161. return final_exit_code
  162. if __name__ == "__main__":
  163. raise SystemExit(main())