Browse Source

Adding a new file for all percentiles

This script allows you to easily apply the Prentice criteria and related analyses to your own data by modifying the file paths and column names in the configuration section. It computes treatment effects, surrogate effects, AUC, and PE risk across percentiles, then generates both a detailed table and an interactive plot. Simply update the paths to your data files, and the script will handle the rest, including exporting the results to CSV and Excel for further analysis.
Zahra Alirezaei 2 weeks ago
parent
commit
39b29402b7
1 changed file with 164 additions and 118 deletions
  1. 164 118
      prentice_analysis.py

+ 164 - 118
prentice_analysis.py

@@ -1,118 +1,164 @@
-import numpy as np
-import pandas as pd
-from scipy.io import loadmat
-import statsmodels.api as sm
-
-
def load_prentice_mat(flags_path, suv_path):
    """Read the flags and SUV .mat files and bundle their arrays.

    Returns a dict with patient IDs (as plain strings), the AE flag
    matrix, the day matrix, and the combined lung SUV-percentile array.
    """
    flags_contents = loadmat(flags_path)
    suv_contents = loadmat(suv_path)

    # Patient IDs arrive as a MATLAB cell column; squeeze each cell down
    # to a scalar and stringify it.
    raw_ids = flags_contents["patients"]
    ids = [str(raw_ids[row, 0].squeeze()) for row in range(raw_ids.shape[0])]

    return {
        "patient_ids": ids,
        "flags": flags_contents["flags"],
        "days": flags_contents["days"],
        "lung": suv_contents["lung_SUVperc_COMBINED"],
    }
-
-
def extract_lung_percentile_max(lung_array, percentile):
    """Per-patient max of one SUV percentile slice, ignoring NaNs.

    The slice at index ``percentile - 1`` along the last axis is reduced
    over axis 1; a row consisting entirely of NaNs yields NaN.
    """
    slice_index = int(percentile) - 1
    plane = lung_array[:, :, slice_index]
    # Mask NaNs with -inf so they never win the max, then restore NaN
    # for rows that contained nothing but NaNs.
    masked = np.where(np.isnan(plane), -np.inf, plane)
    row_max = masked.max(axis=1)
    row_max[np.isneginf(row_max)] = np.nan
    return row_max
-
-
def build_analysis_df_from_mat(data, treatment_col, ae_col, percentiles):
    """Assemble the per-patient analysis table from the loaded .mat arrays.

    Columns: Patient_ID, raw treatment flag, raw AE flag, plus one
    "Lung_SUV_{p}_max" column per requested percentile.
    """
    columns = {
        "Patient_ID": data["patient_ids"],
        "Treatment_raw": data["flags"][:, treatment_col],
        "AE_raw": data["flags"][:, ae_col],
    }
    for pct in percentiles:
        columns[f"Lung_SUV_{pct}_max"] = extract_lung_percentile_max(data["lung"], pct)
    return pd.DataFrame(columns)
-
-
def collapse_treatment_to_two_groups(series, mapping):
    """Recode raw treatment labels via *mapping*; unmapped values become NaN."""
    recoded = series.map(mapping)
    return recoded
-
-
def add_constant_safe(x):
    """Prepend an intercept column, forcing one even if already present."""
    with_const = sm.add_constant(x, has_constant="add")
    return with_const
-
-
def fit_logit(y, x):
    """Fit a logistic regression of *y* on *x* plus an intercept, quietly."""
    design = add_constant_safe(x)
    return sm.Logit(y, design).fit(disp=0)
-
-
def fit_ols(y, x):
    """Fit an ordinary least-squares regression of *y* on *x* plus an intercept."""
    design = add_constant_safe(x)
    return sm.OLS(y, design).fit()
-
-
def null_logit_llf(y):
    """Log-likelihood of the intercept-only logistic model for *y*."""
    intercept_only = np.ones((len(y), 1))
    return sm.Logit(y, intercept_only).fit(disp=0).llf
-
-
def run_prentice_single(df, biomarker_col, treatment_col="Treatment", ae_col="AE", alpha=0.05):
    """Evaluate the four Prentice surrogacy criteria for one biomarker.

    Fits:
      m1: logit(AE ~ treatment)             -> C1 (treatment affects outcome)
      m2: OLS(biomarker ~ treatment)        -> C2 (treatment affects surrogate)
      m3: logit(AE ~ biomarker)             -> C3 (surrogate predicts outcome)
      m4: logit(AE ~ treatment + biomarker) -> C4 (surrogate captures effect)

    Rows with any missing value in the three columns are dropped.
    Returns a dict with per-criterion p-values, boolean pass flags at
    *alpha*, the likelihood-based proportion explained (PE), and a
    Brier-score-based analogue (PE_risk).
    """
    dfx = df[[treatment_col, ae_col, biomarker_col]].dropna().copy()

    Z = dfx[treatment_col].astype(float).values
    T = dfx[ae_col].astype(float).values
    S = dfx[biomarker_col].astype(float).values

    m1 = fit_logit(T, Z)
    m2 = fit_ols(S, Z)
    m3 = fit_logit(T, S)
    m4 = fit_logit(T, pd.DataFrame({"Z": Z, "S": S}))

    # Fix: when fitted on a DataFrame (m4), `pvalues` is a label-indexed
    # pandas Series, for which integer positional indexing like
    # `pvalues[1]` is deprecated. Normalise everything to plain arrays
    # (index 0 is the intercept) before positional access.
    p1 = np.asarray(m1.pvalues)
    p2 = np.asarray(m2.pvalues)
    p3 = np.asarray(m3.pvalues)
    p4 = np.asarray(m4.pvalues)

    # Likelihood-based proportion explained:
    # (llf of AE~Z  -  llf of AE~Z+S) / (llf of AE~Z  -  llf of null).
    ll_aa = m1.llf
    ll_ab = m4.llf
    ll_bb = null_logit_llf(T)
    pe = (ll_aa - ll_ab) / (ll_aa - ll_bb) if (ll_aa - ll_bb) != 0 else np.nan

    # Brier-score (mean squared error) version of the same ratio.
    p_aa = m1.predict(add_constant_safe(Z))
    p_ab = m4.predict(add_constant_safe(pd.DataFrame({"Z": Z, "S": S})))
    p_bb = np.repeat(np.mean(T), len(T))

    aa = np.mean((T - p_aa) ** 2)
    ab = np.mean((T - p_ab) ** 2)
    bb = np.mean((T - p_bb) ** 2)
    pe_risk = (aa - ab) / (aa - bb) if (aa - bb) != 0 else np.nan

    return {
        "biomarker": biomarker_col,
        "C1_p": p1[1],
        "C2_p": p2[1],
        "C3_p": p3[1],
        "C4_Z_p": p4[1],
        "C4_S_p": p4[2],
        "PE": pe,
        "PE_risk": pe_risk,
        "C1": p1[1] < alpha,
        "C2": p2[1] < alpha,
        "C3": p3[1] < alpha,
        # C4 passes when the surrogate stays significant while the
        # treatment term drops out of the joint model.
        "C4": (p4[2] < alpha) and (p4[1] > alpha),
    }
-
-
def run_prentice_multiple(df, biomarker_cols, treatment_col="Treatment", ae_col="AE", alpha=0.05):
    """Run the Prentice analysis for several biomarkers, one result row each.

    A biomarker whose fit raises is recorded as {"biomarker", "error"}
    instead of aborting the whole sweep.
    """
    records = []
    for biomarker in biomarker_cols:
        try:
            record = run_prentice_single(
                df, biomarker,
                treatment_col=treatment_col, ae_col=ae_col, alpha=alpha,
            )
        except Exception as exc:  # keep going; report the failure inline
            record = {"biomarker": biomarker, "error": str(exc)}
        records.append(record)
    return pd.DataFrame(records)
+import pandas as pd
+import numpy as np
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import statsmodels.api as sm
+from sklearn.metrics import roc_auc_score
+from scipy.io import loadmat
+
# ============================================================
# CONFIGURATION SECTION (Customize Data Paths and Column Names)
# ============================================================
# Point these at your own files before running.
suv_data_path = 'path_to_suv_data.xlsx'  # SUV data workbook (Excel)
flags_data_path = 'path_to_flags_data.mat'  # AE flags file (MATLAB .mat)

# Column names as they appear in your dataset.
treatment_column = 'Treatment'  # treatment arm (e.g. Monotherapy, Combo)
suv_column = 'SUV_percentile'  # SUV percentile values
ae_flag_column = 'Lung_AE_Flag'  # adverse-event flag (0 or 1)
percentile_column = 'percentile'  # percentile grid (5, 10, 15, ..., 95)

# ============================================================
# DATA IMPORT SECTION: Load Data
# ============================================================
# NOTE(review): these run at import time and will fail until the paths
# above are updated; swap read_excel for read_csv if your SUV data is CSV.
suv_data = pd.read_excel(suv_data_path)

flags_data = loadmat(flags_data_path)
ae_flags = flags_data['Lung_AE_Flag']  # key must match your .mat file
+
# ============================================================
# Step 1: Treatment Effect (C1) - β_Z (Logistic Regression)
# ============================================================
def calculate_treatment_effect(data, treatment_column, ae_flag_column):
    """Logistic regression of the AE flag on treatment (Criterion 1).

    Returns (beta_Z, p_value_C1): the treatment coefficient and its p-value.
    NOTE(review): assumes the treatment column is numeric — encode string
    arms (e.g. Monotherapy/Combo) as 0/1 first.
    """
    design = sm.add_constant(data[treatment_column])  # intercept + treatment
    outcome = data[ae_flag_column]
    fit = sm.Logit(outcome, design).fit(disp=False)
    return fit.params[treatment_column], fit.pvalues[treatment_column]

# Example usage for Treatment Effect (C1)
beta_Z, p_value_C1 = calculate_treatment_effect(suv_data, treatment_column, ae_flag_column)
+
# ============================================================
# Step 2: Treatment Effect on Biomarker (C2) - α_Z
# ============================================================
def calculate_treatment_on_biomarker(data, treatment_column, suv_column):
    """Linear (OLS) regression of the SUV biomarker on treatment (Criterion 2).

    Returns (alpha_Z, p_value_C2): the treatment coefficient and its p-value.
    """
    X = sm.add_constant(data[treatment_column])  # Add intercept term
    y = data[suv_column]  # SUV values as response
    # Fix: OLS.fit() has no `disp` option (that is a Logit/MLE argument);
    # the previous `disp=False` was silently swallowed by **kwargs.
    model = sm.OLS(y, X).fit()
    alpha_Z = model.params[treatment_column]  # Treatment effect on the biomarker
    p_value_C2 = model.pvalues[treatment_column]  # p-value for treatment effect on biomarker
    return alpha_Z, p_value_C2

# Example usage for Treatment Effect on Biomarker (C2)
alpha_Z, p_value_C2 = calculate_treatment_on_biomarker(suv_data, treatment_column, suv_column)
+
# ============================================================
# Step 3: Surrogate Effect (C3) - η_S
# ============================================================
def calculate_surrogate_effect(data, suv_column, ae_flag_column):
    """Logistic regression of the AE flag on the SUV biomarker (Criterion 3).

    Returns (eta_S, p_value_C3): the biomarker coefficient and its p-value.
    """
    predictors = sm.add_constant(data[suv_column])  # intercept + biomarker
    response = data[ae_flag_column]
    fit = sm.Logit(response, predictors).fit(disp=False)
    return fit.params[suv_column], fit.pvalues[suv_column]

# Example usage for Surrogate Effect (C3)
eta_S, p_value_C3 = calculate_surrogate_effect(suv_data, suv_column, ae_flag_column)
+
# ============================================================
# Step 4: AUC Calculation (C3)
# ============================================================
def calculate_auc(data, suv_column, ae_flag_column):
    """ROC AUC of the SUV biomarker as a classifier for the AE flag."""
    labels = data[ae_flag_column]
    scores = data[suv_column]
    return roc_auc_score(labels, scores)

# Example usage for AUC (C3)
auc_C3 = calculate_auc(suv_data, suv_column, ae_flag_column)
+
# ============================================================
# Step 5: PE Risk Calculation (C4)
# ============================================================
def calculate_pe_risk(data, treatment_column, suv_column, ae_flag_column):
    """Proportion of the treatment effect explained by the surrogate (Brier scale).

    Fix: the previous version ignored `treatment_column` entirely and
    simply refit the Step-3 surrogate model, returning eta_S again. PE
    risk is the reduction in squared-error (Brier) risk gained by adding
    the surrogate to the treatment-only model, relative to the reduction
    from the null model to the treatment-only model:

        PE_risk = (Brier(T~Z) - Brier(T~Z+S)) / (Brier(T~Z) - Brier(null))

    Returns np.nan when the denominator is zero.
    NOTE(review): assumes the treatment column is numerically coded.
    """
    y = data[ae_flag_column].astype(float)
    Z = data[treatment_column].astype(float)
    S = data[suv_column].astype(float)

    X_z = sm.add_constant(pd.DataFrame({treatment_column: Z}))
    X_zs = sm.add_constant(pd.DataFrame({treatment_column: Z, suv_column: S}))

    model_z = sm.Logit(y, X_z).fit(disp=False)
    model_zs = sm.Logit(y, X_zs).fit(disp=False)

    # Mean squared error (Brier score) of each candidate predictor.
    brier_z = np.mean((y - model_z.predict(X_z)) ** 2)
    brier_zs = np.mean((y - model_zs.predict(X_zs)) ** 2)
    brier_null = np.mean((y - y.mean()) ** 2)

    denom = brier_z - brier_null
    return (brier_z - brier_zs) / denom if denom != 0 else np.nan

# Example usage for PE Risk (C4)
PE_risk_C4 = calculate_pe_risk(suv_data, treatment_column, suv_column, ae_flag_column)
+
# ============================================================
# Step 6: Creating a Clean Table with All Results
# ============================================================
# Each metric is a single scalar for the whole dataset, so it is repeated
# once per row alongside the percentile grid.
n_rows = len(suv_data)
scalar_metrics = [
    ("Treatment Effect (C1)", beta_Z),
    ("p-value (C1)", p_value_C1),
    ("Treatment Effect (C2)", alpha_Z),
    ("p-value (C2)", p_value_C2),
    ("Surrogate Effect (C3)", eta_S),
    ("p-value (C3)", p_value_C3),
    ("AUC (C3)", auc_C3),
    ("PE Risk (C4)", PE_risk_C4),
]
results = {"Percentile": suv_data[percentile_column]}
for metric_label, metric_value in scalar_metrics:
    results[metric_label] = [metric_value] * n_rows

results_df = pd.DataFrame(results)
+
# ============================================================
# Step 7: Visualize Results using Plotly (Interactive Plot)
# ============================================================
def create_interactive_plot():
    """Interactive comparison of C2/C3 effects, AUC and PE risk by percentile.

    Fix: each metric is a scalar, but go.Scatter needs one y value per x
    value — broadcast every metric across the percentile grid so the
    lines actually render (previously a bare scalar was passed as `y`).
    Also pass `subplot_titles` as a real one-element tuple.
    """
    percentiles = suv_data[percentile_column]
    n = len(percentiles)

    fig = make_subplots(rows=1, cols=1, subplot_titles=("Treatment Effects and Metrics",))

    metric_traces = [
        ("Treatment Effect on Biomarker (C2)", alpha_Z),
        ("Surrogate Effect (C3)", eta_S),
        ("AUC (C3)", auc_C3),
        ("PE Risk (C4)", PE_risk_C4),
    ]
    for trace_name, value in metric_traces:
        fig.add_trace(go.Scatter(
            x=percentiles,
            y=[value] * n,  # broadcast the scalar metric over the grid
            mode="lines+markers",
            name=trace_name,
        ))

    fig.update_layout(
        title="Comparison of Treatment Effects, Surrogate Effects, AUC, and PE Risk Across Percentiles",
        xaxis_title="SUV Percentile",
        yaxis_title="Metric Value",
        width=1000,
        height=600
    )

    fig.show()

# Call the function to visualize the plot
create_interactive_plot()
+
# ============================================================
# Export Results to CSV and Excel
# ============================================================
# Persist the summary table in both formats for downstream analysis.
csv_output_path = 'results_table.csv'
excel_output_path = 'results_table.xlsx'
results_df.to_csv(csv_output_path, index=False)
results_df.to_excel(excel_output_path, index=False)