Переглянути джерело

Finishing prentice criteria analysis

Martin Horvat 1 тиждень тому
батько
коміт
877e854228
2 змінених файлів з 346 додано та 2 видалено
  1. 179 0
      prentice_criteria.ipynb
  2. 167 2
      src/utils.py

Різницю між файлами не показано, бо вона завелика
+ 179 - 0
prentice_criteria.ipynb


+ 167 - 2
src/utils.py

@@ -13,7 +13,7 @@ def read_data(filename, cols_in, cols_out, logscale = False):
     
     return df_data, df_work
 
-def tz_analysis(
+def binary_independence_analysis(
     df: pd.DataFrame,
     t_col: str = "T",
     z_col: str = "Z",
@@ -91,4 +91,169 @@ def tz_analysis(
                 "reject_alpha": bool(p_fisher < alpha),
             },
         },
-    }
+    }
+
+import pandas as pd
+import numpy as np
+import statsmodels.formula.api as smf
+from typing import Literal, Dict, Any, Tuple, Union, Optional
+
+from scipy.stats import (
+    fisher_exact,
+    chi2_contingency, 
+    ttest_ind,
+    mannwhitneyu,
+    ks_2samp,
+    permutation_test
+)
+
# Allowed test names for criterion 1 (treatment -> true endpoint) and
# criterion 2 (treatment -> surrogate); used to annotate the
# corresponding `prentice_criteria` arguments.
Criterion1Test = Literal["logit", "fisher", "chi2"]
Criterion2Test = Literal["ols", "ols_robust", "welch", "mannwhitney", "ks", "permutation_mean"]
+
def prentice_criteria(
    data: pd.DataFrame,
    endpoint: str = "T",
    surrogate: str = "S",
    treatment: str = "Z",
    alpha: float = 0.05,
    criterion1_test: Criterion1Test = "logit",
    criterion2_test: Criterion2Test = "ols_robust",
    chi2_yates: bool = False,
    n_resamples: int = 10000,
    random_state: int = 123,
    return_models: bool = False
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict[str, Any]]]:
    """Evaluate the four Prentice criteria for surrogate-endpoint validation.

    The criteria are assessed on a binary true endpoint:
      1. Treatment has a significant effect on the true endpoint.
      2. Treatment has a significant effect on the surrogate.
      3. The surrogate is associated with the true endpoint.
      4. The treatment effect on the true endpoint vanishes after
         adjusting for the surrogate (full mediation).
    A final "Proportion Explained" (PE) row compares the treatment
    coefficient of the unadjusted vs. surrogate-adjusted logistic models.

    Parameters
    ----------
    data : pd.DataFrame
        Input data containing the three analysis columns.
    endpoint, surrogate, treatment : str
        Column names. `endpoint` is modelled with logistic regression, so
        it must be binary 0/1.
    alpha : float
        Significance level used for every pass/fail decision.
    criterion1_test : {"logit", "fisher", "chi2"}
        Test for criterion 1; "fisher"/"chi2" require a 2x2
        treatment-by-endpoint table.
    criterion2_test : {"ols", "ols_robust", "welch", "mannwhitney", "ks", "permutation_mean"}
        Test for criterion 2; the two-sample tests require a binary 0/1
        treatment coding.
    chi2_yates : bool
        Apply Yates' continuity correction in the chi-squared test.
    n_resamples, random_state : int
        Settings for the "permutation_mean" test.
    return_models : bool
        If True, also return a dict with fitted models and extra stats.

    Returns
    -------
    pd.DataFrame, or (pd.DataFrame, dict) when `return_models` is True.

    Raises
    ------
    ValueError
        On an unknown test name, or when a chosen test requires binary
        data that the supplied columns do not provide.
    """
    # Validate test choices up front so an unknown name fails fast with a
    # clear ValueError.  Previously an invalid `criterion2_test` fell
    # through the if/elif chain and crashed later with a NameError on
    # undefined `p2`/`method2`/`model2_label`.
    if criterion1_test not in ("logit", "fisher", "chi2"):
        raise ValueError(f"Invalid criterion1_test: {criterion1_test!r}")
    if criterion2_test not in (
        "ols", "ols_robust", "welch", "mannwhitney", "ks", "permutation_mean"
    ):
        raise ValueError(f"Invalid criterion2_test: {criterion2_test!r}")

    # Complete-case analysis restricted to the three relevant columns.
    df = data[[endpoint, surrogate, treatment]].dropna().copy()

    f1 = f"{endpoint} ~ {treatment}"
    f2 = f"{surrogate} ~ {treatment}"
    f3 = f"{endpoint} ~ {surrogate}"
    f4 = f"{endpoint} ~ {surrogate} + {treatment}"

    # =========================================================
    # Criterion 1: Treatment effect on True Endpoint
    # =========================================================
    # The unadjusted logit model is always fitted, even for the
    # table-based tests: its treatment coefficient is needed later for
    # the PE calculation.
    model1 = smf.logit(f1, data=df).fit(disp=0)
    aux: Dict[str, Any] = {"criterion1_logit": model1}

    if criterion1_test in ("fisher", "chi2"):
        table1 = pd.crosstab(df[treatment], df[endpoint])
        if table1.shape != (2, 2):
            raise ValueError(f"'{criterion1_test}' requires binary {treatment}/{endpoint}.")

        if criterion1_test == "fisher":
            _, p1 = fisher_exact(table1)
            method1 = "Fisher's exact test"
        else:  # chi2
            chi2_stat, p1, dof, expected = chi2_contingency(table1, correction=chi2_yates)
            method1 = "Pearson's Chi-squared test"
            aux["criterion1_chi2_stats"] = {"stat": chi2_stat, "dof": dof, "expected": expected}

        model1_label = f"{treatment} x {endpoint} (2x2 table)"
        aux["criterion1_contingency_table"] = table1
    else:  # "logit" (already validated above)
        p1 = model1.pvalues[treatment]
        method1 = "Logistic regression"
        model1_label = f1

    pass1 = bool(p1 < alpha)

    # =========================================================
    # Criterion 2: Treatment effect on Surrogate
    # =========================================================
    # Per-arm surrogate samples, used by the two-sample tests below.
    s0 = df.loc[df[treatment] == 0, surrogate].dropna()
    s1 = df.loc[df[treatment] == 1, surrogate].dropna()

    unique_treat = sorted(df[treatment].dropna().unique())
    if criterion2_test in {"welch", "mannwhitney", "ks", "permutation_mean"}:
        if len(unique_treat) != 2 or set(unique_treat) != {0, 1}:
            raise ValueError(f"'{criterion2_test}' requires binary 0/1 treatment.")

    model2 = None
    effect2 = np.nan

    if criterion2_test == "ols":
        model2 = smf.ols(f2, data=df).fit()
        p2, effect2, method2 = model2.pvalues[treatment], model2.params[treatment], "OLS"
        model2_label = f2
    elif criterion2_test == "ols_robust":
        # HC3 heteroskedasticity-robust standard errors.
        model2 = smf.ols(f2, data=df).fit(cov_type="HC3")
        p2, effect2, method2 = model2.pvalues[treatment], model2.params[treatment], "OLS (HC3 robust SE)"
        model2_label = f2
    elif criterion2_test == "welch":
        stat2, p2 = ttest_ind(s1, s0, equal_var=False, nan_policy="omit")
        effect2, method2 = s1.mean() - s0.mean(), "Welch two-sample t-test"
        model2_label = f"{surrogate} by {treatment} groups"
        aux["criterion2_statistic"] = stat2
    elif criterion2_test == "mannwhitney":
        stat2, p2 = mannwhitneyu(s1, s0, alternative="two-sided")
        effect2, method2 = s1.median() - s0.median(), "Mann–Whitney U test"
        model2_label = f"{surrogate} by {treatment} groups"
        aux["criterion2_statistic"] = stat2
    elif criterion2_test == "ks":
        stat2, p2 = ks_2samp(s1, s0, alternative="two-sided")
        # The KS statistic itself is reported as the "effect" size.
        effect2, method2 = stat2, "Kolmogorov–Smirnov test"
        model2_label = f"{surrogate} by {treatment} groups"
        aux["criterion2_statistic"] = stat2
    else:  # "permutation_mean" (already validated above)
        def mean_diff(x, y, axis=0):
            return np.mean(x, axis=axis) - np.mean(y, axis=axis)

        perm_res = permutation_test(
            (s1.to_numpy(), s0.to_numpy()), statistic=mean_diff,
            permutation_type="independent", alternative="two-sided",
            n_resamples=n_resamples, random_state=random_state
        )
        p2, effect2, method2 = perm_res.pvalue, s1.mean() - s0.mean(), "Permutation test"
        model2_label = f"{surrogate} by {treatment} groups"
        aux["criterion2_permutation_result"] = perm_res

    if model2 is not None:
        aux["criterion2_model"] = model2
    pass2 = bool(p2 < alpha)

    # =========================================================
    # Criterion 3: Surrogate -> True; Criterion 4: full mediation
    # =========================================================
    model3 = smf.logit(f3, data=df).fit(disp=0)
    p3 = model3.pvalues[surrogate]
    pass3 = bool(p3 < alpha)
    aux["criterion3"] = model3

    model4 = smf.logit(f4, data=df).fit(disp=0)
    p4_z, p4_s = model4.pvalues[treatment], model4.pvalues[surrogate]
    # Criterion 4 passes when the adjusted treatment effect is NOT
    # significant while the surrogate remains significant.
    pass4 = bool((p4_z > alpha) and (p4_s < alpha))
    aux["criterion4"] = model4

    # Proportion Explained: relative drop of the treatment log-odds
    # coefficient after adjusting for the surrogate.
    # NOTE(review): comparing logistic coefficients across nested models
    # is affected by non-collapsibility of the odds ratio — treat PE as a
    # heuristic, not a formal estimate.
    beta_unadj = model1.params[treatment]
    beta_adj = model4.params[treatment]
    pe = (beta_unadj - beta_adj) / beta_unadj if beta_unadj != 0 else np.nan

    # --- Construct Results ---
    results_list = [
        {"Criterion": "1. Treatment -> True", "Method": method1, "Model": model1_label, "Estimate": model1.params[treatment] if treatment in model1.params else np.nan, "P-value": p1, "Pass": pass1},
        {"Criterion": "2. Treatment -> Surrogate", "Method": method2, "Model": model2_label, "Estimate": effect2, "P-value": p2, "Pass": pass2},
        {"Criterion": "3. Surrogate -> True", "Method": "Logistic", "Model": f3, "Estimate": model3.params[surrogate], "P-value": p3, "Pass": pass3},
        {"Criterion": "4. Full Mediation", "Method": "Logistic", "Model": f4, "Estimate": model4.params[treatment], "P-value": p4_z, "Pass": pass4},

        # --- Proportion Explained (still uses logistic coefficients) ---
        {
        "Criterion": "Proportion Explained (PE)",
        "Method": "Logistic coefficients",
        "Model": f"Compare {treatment} in ({f1}) vs ({f4}).",
        "Estimate": pe,
        "P-value": pd.NA,
        # NaN PE (zero unadjusted effect) compares False, i.e. "fail".
        "Pass": (0.5 <= pe <= 0.75)
        }
    ]

    results_df = pd.DataFrame(results_list)
    # Nullable boolean dtype so a future pd.NA "Pass" would round-trip.
    results_df["Pass"] = results_df["Pass"].astype("boolean")

    if return_models:
        return results_df, aux
    return results_df

Деякі файли не було показано, через те що забагато файлів було змінено