# prentice_analysis.py
import numpy as np
import pandas as pd
from scipy.io import loadmat
import statsmodels.api as sm
  5. def load_prentice_mat(flags_path, suv_path):
  6. flags_mat = loadmat(flags_path)
  7. suv_mat = loadmat(suv_path)
  8. patients_raw = flags_mat["patients"]
  9. patient_ids = [str(patients_raw[i, 0].squeeze()) for i in range(patients_raw.shape[0])]
  10. flags = flags_mat["flags"]
  11. days = flags_mat["days"]
  12. lung = suv_mat["lung_SUVperc_COMBINED"]
  13. return {
  14. "patient_ids": patient_ids,
  15. "flags": flags,
  16. "days": days,
  17. "lung": lung,
  18. }
  19. def extract_lung_percentile_max(lung_array, percentile):
  20. idx = int(percentile) - 1
  21. x = lung_array[:, :, idx]
  22. x = np.where(np.isnan(x), -np.inf, x)
  23. x_max = np.max(x, axis=1)
  24. x_max[np.isneginf(x_max)] = np.nan
  25. return x_max
  26. def build_analysis_df_from_mat(data, treatment_col, ae_col, percentiles):
  27. df = pd.DataFrame({
  28. "Patient_ID": data["patient_ids"],
  29. "Treatment_raw": data["flags"][:, treatment_col],
  30. "AE_raw": data["flags"][:, ae_col],
  31. })
  32. for p in percentiles:
  33. df[f"Lung_SUV_{p}_max"] = extract_lung_percentile_max(data["lung"], p)
  34. return df
  35. def collapse_treatment_to_two_groups(series, mapping):
  36. return series.map(mapping)
  37. def add_constant_safe(x):
  38. return sm.add_constant(x, has_constant="add")
  39. def fit_logit(y, x):
  40. return sm.Logit(y, add_constant_safe(x)).fit(disp=0)
  41. def fit_ols(y, x):
  42. return sm.OLS(y, add_constant_safe(x)).fit()
  43. def null_logit_llf(y):
  44. return sm.Logit(y, np.ones((len(y), 1))).fit(disp=0).llf
  45. def run_prentice_single(df, biomarker_col, treatment_col="Treatment", ae_col="AE", alpha=0.05):
  46. dfx = df[[treatment_col, ae_col, biomarker_col]].dropna().copy()
  47. Z = dfx[treatment_col].astype(float).values
  48. T = dfx[ae_col].astype(float).values
  49. S = dfx[biomarker_col].astype(float).values
  50. m1 = fit_logit(T, Z)
  51. m2 = fit_ols(S, Z)
  52. m3 = fit_logit(T, S)
  53. m4 = fit_logit(T, pd.DataFrame({"Z": Z, "S": S}))
  54. ll_aa = m1.llf
  55. ll_ab = m4.llf
  56. ll_bb = null_logit_llf(T)
  57. pe = (ll_aa - ll_ab) / (ll_aa - ll_bb) if (ll_aa - ll_bb) != 0 else np.nan
  58. p_aa = m1.predict(add_constant_safe(Z))
  59. p_ab = m4.predict(add_constant_safe(pd.DataFrame({"Z": Z, "S": S})))
  60. p_bb = np.repeat(np.mean(T), len(T))
  61. aa = np.mean((T - p_aa) ** 2)
  62. ab = np.mean((T - p_ab) ** 2)
  63. bb = np.mean((T - p_bb) ** 2)
  64. pe_risk = (aa - ab) / (aa - bb) if (aa - bb) != 0 else np.nan
  65. return {
  66. "biomarker": biomarker_col,
  67. "C1_p": m1.pvalues[1],
  68. "C2_p": m2.pvalues[1],
  69. "C3_p": m3.pvalues[1],
  70. "C4_Z_p": m4.pvalues[1],
  71. "C4_S_p": m4.pvalues[2],
  72. "PE": pe,
  73. "PE_risk": pe_risk,
  74. "C1": m1.pvalues[1] < alpha,
  75. "C2": m2.pvalues[1] < alpha,
  76. "C3": m3.pvalues[1] < alpha,
  77. "C4": (m4.pvalues[2] < alpha) and (m4.pvalues[1] > alpha),
  78. }
  79. def run_prentice_multiple(df, biomarker_cols, treatment_col="Treatment", ae_col="AE", alpha=0.05):
  80. rows = []
  81. for col in biomarker_cols:
  82. try:
  83. out = run_prentice_single(df, col, treatment_col=treatment_col, ae_col=ae_col, alpha=alpha)
  84. except Exception as e:
  85. out = {"biomarker": col, "error": str(e)}
  86. rows.append(out)
  87. return pd.DataFrame(rows)