Browse Source

Adding a new file for all percentiles

This script allows you to easily apply the Prentice criteria and related analyses to your own data by modifying the file paths and column names in the configuration section. It computes treatment effects, surrogate effects, AUC, and PE risk across percentiles, then generates both a detailed table and an interactive plot. Simply update the paths to your data files, and the script will handle the rest, including exporting the results to CSV and Excel for further analysis.
Zahra Alirezaei 2 weeks ago
parent
commit
39b29402b7
1 changed file with 164 additions and 118 deletions
  1. 164 118
      prentice_analysis.py

+ 164 - 118
prentice_analysis.py

@@ -1,118 +1,164 @@
-import numpy as np
-import pandas as pd
-from scipy.io import loadmat
-import statsmodels.api as sm
-
-
def load_prentice_mat(flags_path, suv_path):
    """Read the flags and SUV .mat files and bundle their arrays.

    Returns a dict with patient IDs (as plain strings), the AE flag
    matrix, the day matrix, and the combined lung SUV-percentile array.
    """
    flags_contents = loadmat(flags_path)
    suv_contents = loadmat(suv_path)

    # Patient IDs arrive as a MATLAB cell column; squeeze each cell down
    # to a scalar and stringify it.
    raw_ids = flags_contents["patients"]
    ids = [str(raw_ids[row, 0].squeeze()) for row in range(raw_ids.shape[0])]

    return {
        "patient_ids": ids,
        "flags": flags_contents["flags"],
        "days": flags_contents["days"],
        "lung": suv_contents["lung_SUVperc_COMBINED"],
    }
-
-
def extract_lung_percentile_max(lung_array, percentile):
    """Per-patient max of one SUV percentile slice, ignoring NaNs.

    The slice at index ``percentile - 1`` along the last axis is reduced
    over axis 1; a row consisting entirely of NaNs yields NaN.
    """
    slice_index = int(percentile) - 1
    plane = lung_array[:, :, slice_index]
    # Mask NaNs with -inf so they never win the max, then restore NaN
    # for rows that contained nothing but NaNs.
    masked = np.where(np.isnan(plane), -np.inf, plane)
    row_max = masked.max(axis=1)
    row_max[np.isneginf(row_max)] = np.nan
    return row_max
-
-
def build_analysis_df_from_mat(data, treatment_col, ae_col, percentiles):
    """Assemble the per-patient analysis table from the loaded .mat arrays.

    Columns: Patient_ID, raw treatment flag, raw AE flag, plus one
    "Lung_SUV_{p}_max" column per requested percentile.
    """
    columns = {
        "Patient_ID": data["patient_ids"],
        "Treatment_raw": data["flags"][:, treatment_col],
        "AE_raw": data["flags"][:, ae_col],
    }
    for pct in percentiles:
        columns[f"Lung_SUV_{pct}_max"] = extract_lung_percentile_max(data["lung"], pct)
    return pd.DataFrame(columns)
-
-
def collapse_treatment_to_two_groups(series, mapping):
    """Recode raw treatment labels via *mapping*; unmapped values become NaN."""
    recoded = series.map(mapping)
    return recoded
-
-
def add_constant_safe(x):
    """Prepend an intercept column, forcing one even if already present."""
    with_const = sm.add_constant(x, has_constant="add")
    return with_const
-
-
def fit_logit(y, x):
    """Fit a logistic regression of *y* on *x* plus an intercept, quietly."""
    design = add_constant_safe(x)
    return sm.Logit(y, design).fit(disp=0)
-
-
def fit_ols(y, x):
    """Fit an ordinary least-squares regression of *y* on *x* plus an intercept."""
    design = add_constant_safe(x)
    return sm.OLS(y, design).fit()
-
-
def null_logit_llf(y):
    """Log-likelihood of the intercept-only logistic model for *y*."""
    intercept_only = np.ones((len(y), 1))
    return sm.Logit(y, intercept_only).fit(disp=0).llf
-
-
def run_prentice_single(df, biomarker_col, treatment_col="Treatment", ae_col="AE", alpha=0.05):
    """Evaluate the four Prentice surrogacy criteria for one biomarker.

    Fits:
      m1: logit(AE ~ treatment)             -> C1 (treatment affects outcome)
      m2: OLS(biomarker ~ treatment)        -> C2 (treatment affects surrogate)
      m3: logit(AE ~ biomarker)             -> C3 (surrogate predicts outcome)
      m4: logit(AE ~ treatment + biomarker) -> C4 (surrogate captures effect)

    Rows with any missing value in the three columns are dropped.
    Returns a dict with per-criterion p-values, boolean pass flags at
    *alpha*, the likelihood-based proportion explained (PE), and a
    Brier-score-based analogue (PE_risk).
    """
    dfx = df[[treatment_col, ae_col, biomarker_col]].dropna().copy()

    Z = dfx[treatment_col].astype(float).values
    T = dfx[ae_col].astype(float).values
    S = dfx[biomarker_col].astype(float).values

    m1 = fit_logit(T, Z)
    m2 = fit_ols(S, Z)
    m3 = fit_logit(T, S)
    m4 = fit_logit(T, pd.DataFrame({"Z": Z, "S": S}))

    # Fix: when fitted on a DataFrame (m4), `pvalues` is a label-indexed
    # pandas Series, for which integer positional indexing like
    # `pvalues[1]` is deprecated. Normalise everything to plain arrays
    # (index 0 is the intercept) before positional access.
    p1 = np.asarray(m1.pvalues)
    p2 = np.asarray(m2.pvalues)
    p3 = np.asarray(m3.pvalues)
    p4 = np.asarray(m4.pvalues)

    # Likelihood-based proportion explained:
    # (llf of AE~Z  -  llf of AE~Z+S) / (llf of AE~Z  -  llf of null).
    ll_aa = m1.llf
    ll_ab = m4.llf
    ll_bb = null_logit_llf(T)
    pe = (ll_aa - ll_ab) / (ll_aa - ll_bb) if (ll_aa - ll_bb) != 0 else np.nan

    # Brier-score (mean squared error) version of the same ratio.
    p_aa = m1.predict(add_constant_safe(Z))
    p_ab = m4.predict(add_constant_safe(pd.DataFrame({"Z": Z, "S": S})))
    p_bb = np.repeat(np.mean(T), len(T))

    aa = np.mean((T - p_aa) ** 2)
    ab = np.mean((T - p_ab) ** 2)
    bb = np.mean((T - p_bb) ** 2)
    pe_risk = (aa - ab) / (aa - bb) if (aa - bb) != 0 else np.nan

    return {
        "biomarker": biomarker_col,
        "C1_p": p1[1],
        "C2_p": p2[1],
        "C3_p": p3[1],
        "C4_Z_p": p4[1],
        "C4_S_p": p4[2],
        "PE": pe,
        "PE_risk": pe_risk,
        "C1": p1[1] < alpha,
        "C2": p2[1] < alpha,
        "C3": p3[1] < alpha,
        # C4 passes when the surrogate stays significant while the
        # treatment term drops out of the joint model.
        "C4": (p4[2] < alpha) and (p4[1] > alpha),
    }
-
-
def run_prentice_multiple(df, biomarker_cols, treatment_col="Treatment", ae_col="AE", alpha=0.05):
    """Run the Prentice analysis for several biomarkers, one result row each.

    A biomarker whose fit raises is recorded as {"biomarker", "error"}
    instead of aborting the whole sweep.
    """
    records = []
    for biomarker in biomarker_cols:
        try:
            record = run_prentice_single(
                df, biomarker,
                treatment_col=treatment_col, ae_col=ae_col, alpha=alpha,
            )
        except Exception as exc:  # keep going; report the failure inline
            record = {"biomarker": biomarker, "error": str(exc)}
        records.append(record)
    return pd.DataFrame(records)
+import pandas as pd
+import numpy as np
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import statsmodels.api as sm
+from sklearn.metrics import roc_auc_score
+from scipy.io import loadmat
+
# ============================================================
# CONFIGURATION SECTION (Customize Data Paths and Column Names)
# ============================================================
# Point these at your own files before running.
suv_data_path = 'path_to_suv_data.xlsx'  # SUV data workbook (Excel)
flags_data_path = 'path_to_flags_data.mat'  # AE flags file (MATLAB .mat)

# Column names as they appear in your dataset.
treatment_column = 'Treatment'  # treatment arm (e.g. Monotherapy, Combo)
suv_column = 'SUV_percentile'  # SUV percentile values
ae_flag_column = 'Lung_AE_Flag'  # adverse-event flag (0 or 1)
percentile_column = 'percentile'  # percentile grid (5, 10, 15, ..., 95)

# ============================================================
# DATA IMPORT SECTION: Load Data
# ============================================================
# NOTE(review): these run at import time and will fail until the paths
# above are updated; swap read_excel for read_csv if your SUV data is CSV.
suv_data = pd.read_excel(suv_data_path)

flags_data = loadmat(flags_data_path)
ae_flags = flags_data['Lung_AE_Flag']  # key must match your .mat file
+
# ============================================================
# Step 1: Treatment Effect (C1) - β_Z (Logistic Regression)
# ============================================================
def calculate_treatment_effect(data, treatment_column, ae_flag_column):
    """Logistic regression of the AE flag on treatment (Criterion 1).

    Returns (beta_Z, p_value_C1): the treatment coefficient and its p-value.
    NOTE(review): assumes the treatment column is numeric — encode string
    arms (e.g. Monotherapy/Combo) as 0/1 first.
    """
    design = sm.add_constant(data[treatment_column])  # intercept + treatment
    outcome = data[ae_flag_column]
    fit = sm.Logit(outcome, design).fit(disp=False)
    return fit.params[treatment_column], fit.pvalues[treatment_column]

# Example usage for Treatment Effect (C1)
beta_Z, p_value_C1 = calculate_treatment_effect(suv_data, treatment_column, ae_flag_column)
+
# ============================================================
# Step 2: Treatment Effect on Biomarker (C2) - α_Z
# ============================================================
def calculate_treatment_on_biomarker(data, treatment_column, suv_column):
    """Linear (OLS) regression of the SUV biomarker on treatment (Criterion 2).

    Returns (alpha_Z, p_value_C2): the treatment coefficient and its p-value.
    """
    X = sm.add_constant(data[treatment_column])  # Add intercept term
    y = data[suv_column]  # SUV values as response
    # Fix: OLS.fit() has no `disp` option (that is a Logit/MLE argument);
    # the previous `disp=False` was silently swallowed by **kwargs.
    model = sm.OLS(y, X).fit()
    alpha_Z = model.params[treatment_column]  # Treatment effect on the biomarker
    p_value_C2 = model.pvalues[treatment_column]  # p-value for treatment effect on biomarker
    return alpha_Z, p_value_C2

# Example usage for Treatment Effect on Biomarker (C2)
alpha_Z, p_value_C2 = calculate_treatment_on_biomarker(suv_data, treatment_column, suv_column)
+
# ============================================================
# Step 3: Surrogate Effect (C3) - η_S
# ============================================================
def calculate_surrogate_effect(data, suv_column, ae_flag_column):
    """Logistic regression of the AE flag on the SUV biomarker (Criterion 3).

    Returns (eta_S, p_value_C3): the biomarker coefficient and its p-value.
    """
    predictors = sm.add_constant(data[suv_column])  # intercept + biomarker
    response = data[ae_flag_column]
    fit = sm.Logit(response, predictors).fit(disp=False)
    return fit.params[suv_column], fit.pvalues[suv_column]

# Example usage for Surrogate Effect (C3)
eta_S, p_value_C3 = calculate_surrogate_effect(suv_data, suv_column, ae_flag_column)
+
# ============================================================
# Step 4: AUC Calculation (C3)
# ============================================================
def calculate_auc(data, suv_column, ae_flag_column):
    """ROC AUC of the SUV biomarker as a classifier for the AE flag."""
    labels = data[ae_flag_column]
    scores = data[suv_column]
    return roc_auc_score(labels, scores)

# Example usage for AUC (C3)
auc_C3 = calculate_auc(suv_data, suv_column, ae_flag_column)
+
# ============================================================
# Step 5: PE Risk Calculation (C4)
# ============================================================
def calculate_pe_risk(data, treatment_column, suv_column, ae_flag_column):
    """Proportion of the treatment effect explained by the surrogate (Brier scale).

    Fix: the previous version ignored `treatment_column` entirely and
    simply refit the Step-3 surrogate model, returning eta_S again. PE
    risk is the reduction in squared-error (Brier) risk gained by adding
    the surrogate to the treatment-only model, relative to the reduction
    from the null model to the treatment-only model:

        PE_risk = (Brier(T~Z) - Brier(T~Z+S)) / (Brier(T~Z) - Brier(null))

    Returns np.nan when the denominator is zero.
    NOTE(review): assumes the treatment column is numerically coded.
    """
    y = data[ae_flag_column].astype(float)
    Z = data[treatment_column].astype(float)
    S = data[suv_column].astype(float)

    X_z = sm.add_constant(pd.DataFrame({treatment_column: Z}))
    X_zs = sm.add_constant(pd.DataFrame({treatment_column: Z, suv_column: S}))

    model_z = sm.Logit(y, X_z).fit(disp=False)
    model_zs = sm.Logit(y, X_zs).fit(disp=False)

    # Mean squared error (Brier score) of each candidate predictor.
    brier_z = np.mean((y - model_z.predict(X_z)) ** 2)
    brier_zs = np.mean((y - model_zs.predict(X_zs)) ** 2)
    brier_null = np.mean((y - y.mean()) ** 2)

    denom = brier_z - brier_null
    return (brier_z - brier_zs) / denom if denom != 0 else np.nan

# Example usage for PE Risk (C4)
PE_risk_C4 = calculate_pe_risk(suv_data, treatment_column, suv_column, ae_flag_column)
+
# ============================================================
# Step 6: Creating a Clean Table with All Results
# ============================================================
# Each metric is a single scalar for the whole dataset, so it is repeated
# once per row alongside the percentile grid.
n_rows = len(suv_data)
scalar_metrics = [
    ("Treatment Effect (C1)", beta_Z),
    ("p-value (C1)", p_value_C1),
    ("Treatment Effect (C2)", alpha_Z),
    ("p-value (C2)", p_value_C2),
    ("Surrogate Effect (C3)", eta_S),
    ("p-value (C3)", p_value_C3),
    ("AUC (C3)", auc_C3),
    ("PE Risk (C4)", PE_risk_C4),
]
results = {"Percentile": suv_data[percentile_column]}
for metric_label, metric_value in scalar_metrics:
    results[metric_label] = [metric_value] * n_rows

results_df = pd.DataFrame(results)
+
# ============================================================
# Step 7: Visualize Results using Plotly (Interactive Plot)
# ============================================================
def create_interactive_plot():
    """Interactive comparison of C2/C3 effects, AUC and PE risk by percentile.

    Fix: each metric is a scalar, but go.Scatter needs one y value per x
    value — broadcast every metric across the percentile grid so the
    lines actually render (previously a bare scalar was passed as `y`).
    Also pass `subplot_titles` as a real one-element tuple.
    """
    percentiles = suv_data[percentile_column]
    n = len(percentiles)

    fig = make_subplots(rows=1, cols=1, subplot_titles=("Treatment Effects and Metrics",))

    metric_traces = [
        ("Treatment Effect on Biomarker (C2)", alpha_Z),
        ("Surrogate Effect (C3)", eta_S),
        ("AUC (C3)", auc_C3),
        ("PE Risk (C4)", PE_risk_C4),
    ]
    for trace_name, value in metric_traces:
        fig.add_trace(go.Scatter(
            x=percentiles,
            y=[value] * n,  # broadcast the scalar metric over the grid
            mode="lines+markers",
            name=trace_name,
        ))

    fig.update_layout(
        title="Comparison of Treatment Effects, Surrogate Effects, AUC, and PE Risk Across Percentiles",
        xaxis_title="SUV Percentile",
        yaxis_title="Metric Value",
        width=1000,
        height=600
    )

    fig.show()

# Call the function to visualize the plot
create_interactive_plot()
+
# ============================================================
# Export Results to CSV and Excel
# ============================================================
# Persist the summary table in both formats for downstream analysis.
csv_output_path = 'results_table.csv'
excel_output_path = 'results_table.xlsx'
results_df.to_csv(csv_output_path, index=False)
results_df.to_excel(excel_output_path, index=False)