2 Commits 997b137f5b ... a2838fc38d

Author SHA1 Message Date
  Martin Horvat a2838fc38d First commit. 1 week ago
  Martin Horvat ac04698cb4 Adding gitignore 2 weeks ago

+ 216 - 0
.gitignore

@@ -0,0 +1,216 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#   Usually these files are written by a python script from a template
+#   before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+# Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+# uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+# poetry.lock
+# poetry.toml
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+# pdm.lock
+# pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+# pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# Redis
+*.rdb
+*.aof
+*.pid
+
+# RabbitMQ
+mnesia/
+rabbitmq/
+rabbitmq-data/
+
+# ActiveMQ
+activemq-data/
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#   JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#   be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#   and can be added to the global gitignore or merged into this file.  For a more nuclear
+#   option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# .idea/
+
+# Abstra
+#   Abstra is an AI-powered process automation framework.
+#   Ignore directories containing user credentials, local state, and settings.
+#   Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+#   Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
+#   that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#   and can be added to the global gitignore or merged into this file. However, if you prefer, 
+#   you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+
+# Streamlit
+.streamlit/secrets.toml

BIN
data/prenticedataa.xlsx


+ 205 - 0
fisher_exact.ipynb

@@ -0,0 +1,205 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "58d00ccd-7950-4367-aec8-2c18aef079d1",
+   "metadata": {},
+   "source": [
+    "# Fisher exact tests\n",
+    "\n",
+    "<br>\n",
+    "Author: Martin Horvat, March 2026\n",
+    "\n",
+    "Ref:\n",
+    "* https://en.wikipedia.org/wiki/Fisher%27s_exact_test\n",
+    "* https://mathworld.wolfram.com/FishersExactTest.html"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9b24f6f9-4bf2-4751-8312-f011afd3422c",
+   "metadata": {},
+   "source": [
+    "## Common"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "362e9a29-2caf-4155-8d90-a591005b4b25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from scipy.special import gammaln\n",
+    "\n",
+    "def _log_table_prob(a, b, c, d):\n",
+    "    r1, r2 = a + b, c + d\n",
+    "    c1, c2 = a + c, b + d\n",
+    "    n = r1 + r2\n",
+    "    return (\n",
+    "        gammaln(r1 + 1) + gammaln(r2 + 1) + gammaln(c1 + 1) + gammaln(c2 + 1)\n",
+    "        - (gammaln(a + 1) + gammaln(b + 1) + gammaln(c + 1) + gammaln(d + 1) + gammaln(n + 1))\n",
+    "    )\n",
+    "\n",
+    "def calculate_table_prob(a, b, c, d):\n",
+    "    return float(np.exp(_log_table_prob(int(a), int(b), int(c), int(d))))\n",
+    "\n",
+    "def arange_at_margins(r, c):\n",
+    "    amin = max(0, r[0] - c[1])\n",
+    "    amax = min(r[0], c[0])\n",
+    "    return np.arange(amin, amax + 1, dtype=np.int64)\n",
+    "\n",
+    "def fisher_p_value(table, eps: float = 1e-12):\n",
+    "    mat = np.asarray(table, dtype=np.int64)\n",
+    "    if mat.shape != (2, 2) or np.any(mat < 0):\n",
+    "        raise ValueError(\"table must be a 2x2 array of nonnegative counts\")\n",
+    "\n",
+    "    a, b, c_, d = map(int, mat.ravel())\n",
+    "    r = mat.sum(axis=1)  # [r1, r2]\n",
+    "    c = mat.sum(axis=0)  # [c1, c2]\n",
+    "\n",
+    "    a_vals = arange_at_margins(r, c)\n",
+    "\n",
+    "    n = int(r[0] + r[1])\n",
+    "    const = (\n",
+    "        gammaln(int(r[0]) + 1) + gammaln(int(r[1]) + 1)\n",
+    "        + gammaln(int(c[0]) + 1) + gammaln(int(c[1]) + 1)\n",
+    "        - gammaln(n + 1)\n",
+    "    )\n",
+    "\n",
+    "    b_vals = r[0] - a_vals\n",
+    "    c_vals = c[0] - a_vals\n",
+    "    d_vals = r[1] - c_vals\n",
+    "\n",
+    "    logp = const - (\n",
+    "        gammaln(a_vals + 1) + gammaln(b_vals + 1)\n",
+    "        + gammaln(c_vals + 1) + gammaln(d_vals + 1)\n",
+    "    )\n",
+    "\n",
+    "    logp_obs = _log_table_prob(a, b, c_, d)\n",
+    "    observed_p = float(np.exp(logp_obs))\n",
+    "\n",
+    "    mask = logp <= (logp_obs + np.log1p(eps))\n",
+    "    two_sided_p = float(np.exp(logp[mask]).sum())\n",
+    "\n",
+    "    return observed_p, two_sided_p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3b5109a3-acf0-4285-8cfd-227074cfb015",
+   "metadata": {},
+   "source": [
+    "## Check"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "9d0b1232-ff1a-41da-965e-7f27ed61cd77",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[0.01048951048951043, 0.11013986013986005, 0.33041958041957936, 0.3671328671328668, 0.1573426573426565, 0.023601398601398604, 0.0008741258741258679]\n",
+      "0.9999999999999976\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check \n",
+    "table = [[8, 2], [1, 5]]\n",
+    "\n",
+    "# margins\n",
+    "r = np.sum(table, axis = 1)\n",
+    "c = np.sum(table, axis = 0)\n",
+    "\n",
+    "# values of a\n",
+    "avals = arange_at_margins(r, c)\n",
+    "\n",
+    "probs = [calculate_table_prob(x, r[0] - x, c[0] - x, r[1] - c[0] + x) for x in avals]\n",
+    "\n",
+    "print(probs)\n",
+    "print(sum(probs))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "218076bd-89d8-4305-8656-41cde67de1b4",
+   "metadata": {},
+   "source": [
+    "## Example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "5b861593-4786-4b00-bdce-da1710b68af1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Odds Ratio: 20.0000\n",
+      "P-value: 0.0350\n",
+      "\n",
+      "Result is statistically significant (Reject H0)\n",
+      "\n",
+      "Observed Table Probability: 0.0236\n",
+      "Two-Sided P-Value: 0.0350\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 2x2 Contingency Table\n",
+    "#             Success | Failure\n",
+    "# Treatment A:   8    |    2\n",
+    "# Treatment B:   1    |    5\n",
+    "table = [[8, 2], [1, 5]]\n",
+    "\n",
+    "# Perform the test\n",
+    "# returns the odds ratio and the p-value\n",
+    "from scipy.stats import fisher_exact\n",
+    "odds_ratio, p_value = fisher_exact(table)\n",
+    "\n",
+    "print(f\"Odds Ratio: {odds_ratio:.4f}\")\n",
+    "print(f\"P-value: {p_value:.4f}\")\n",
+    "\n",
+    "# Interpretation\n",
+    "alpha = 0.05\n",
+    "if p_value < alpha:\n",
+    "    print(\"\\nResult is statistically significant (Reject H0)\")\n",
+    "else:\n",
+    "    print(\"\\nResult is not statistically significant (Fail to reject H0)\")\n",
+    "\n",
+    "obs_p, final_p = fisher_p_value(table)\n",
+    "print(f\"\\nObserved Table Probability: {obs_p:.4f}\")\n",
+    "print(f\"Two-Sided P-Value: {final_p:.4f}\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

File diff suppressed because it is too large
+ 335 - 0
prentice_criteria.ipynb


BIN
refs/Algorithm643_6497.214326.pdf


BIN
refs/Baker2018.pdf


BIN
refs/Buyse1998.pdf


BIN
refs/Fisher1922.pdf


BIN
refs/Fisher_Exact_Tests_L07.pdf


BIN
refs/Fisher_Exact_Tests_supp21.pdf


BIN
refs/Freedman1992.pdf


BIN
refs/Martin_Notes.jpg


BIN
refs/Prentice1989.pdf


BIN
refs/Raunig2015.pdf


BIN
refs/Robert Presentation_1.pdf


BIN
refs/Wang2002.pdf


+ 94 - 0
src/utils.py

@@ -0,0 +1,94 @@
+import pandas as pd
+import numpy as np
+from scipy.stats import chi2_contingency, fisher_exact
+
+def read_data(filename, cols_in, cols_out, logscale = False):
+    
+    df_data = pd.read_excel(filename)
+
+    df_work = df_data[cols_in].rename(columns=dict(zip(cols_in, cols_out)))
+    df_work['Z'] = df_work['Z'].map({'PEMBRO': 0, 'IPI+NIVO': 1, 'IPI': 0, 'NIVO': 0})
+    
+    if logscale: df_work['S']  = np.log(df_work['S'])
+    
+    return df_data, df_work
+
+def tz_analysis(
+    df: pd.DataFrame,
+    t_col: str = "T",
+    z_col: str = "Z",
+    alpha: float = 0.05,
+    chi2_yates: bool = False
+) -> dict:
+    
+    # Drop missing and enforce 0/1
+    d = df[[t_col, z_col]].dropna()
+    T = d[t_col].astype(int)
+    Z = d[z_col].astype(int)
+    
+    if not set(T.unique()).issubset({0, 1}) or not set(Z.unique()).issubset({0, 1}):
+        raise ValueError("T and Z must be binary (0/1 or bool).")
+
+    # Counts F[t,z] with explicit order t=0,1 and z=0,1
+    # https://en.wikipedia.org/wiki/Contingency_table
+    F_df = pd.crosstab(T, Z).reindex(index=[0, 1], columns=[0, 1], fill_value=0)
+    F = F_df.to_numpy(dtype=int)
+    N = int(F.sum())
+    if N == 0:
+        raise ValueError("No data after dropping missing values.")
+
+    # Empirical joint/marginals from your computation
+    P = F / N                 # P(T,Z)
+    PT = P.sum(axis=1)        # P(T)
+    PZ = P.sum(axis=0)        # P(Z)
+
+    # Conditionals P(T|Z) as 2x2: rows t, cols z
+    # P(T=t | Z=z) = P(T=t,Z=z)/P(Z=z)
+    with np.errstate(divide="ignore", invalid="ignore"):
+        PT_given_Z = P / PZ   # broadcast over columns
+        PT_given_Z[:, PZ == 0] = np.nan
+
+    # Global tests of independence
+    # https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
+    # https://en.wikipedia.org/wiki/Yates%27s_correction_for_continuity
+    
+    #chi2_yates = np.any(F.ravel() < 5)
+    chi2, p_chi2, dof, expected = chi2_contingency(F, correction=chi2_yates)
+
+    # https://en.wikipedia.org/wiki/Fisher%27s_exact_test
+    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html
+    # https://docs.scipy.org/doc/scipy/tutorial/stats/hypothesis_fisher_exact.html
+    # odds ratio 
+    #   OR = (P[1,1] P[0,0])/(P[1,0] P[0,1]) = O(Z=0)/O(Z=1) 
+    # with odds 
+    #   O(Z) = P(T=0|Z)/P(T=1|Z)
+    # Note: 
+    #   OR = 1: No association; the odds of the outcome are the same in both groups (Null Hypothesis).
+    #   'two-sided': Tests if the odds ratio is simply not 1
+
+    odds_ratio, p_fisher = fisher_exact(F, alternative="two-sided")
+
+    return {
+        "meta": {"t_col": t_col, "z_col": z_col, "alpha": float(alpha)},
+        "counts": {"F": F, "N": N},
+        "probabilities": {
+            "P_TZ": P,                 # 2x2 array [t,z]
+            "P_T": PT,                 # length-2 [t]
+            "P_Z": PZ,                 # length-2 [z]
+            "P_T_given_Z": PT_given_Z, # 2x2 array [t,z]
+        },
+        "global_tests": {
+            "chi2": {
+                "stat": float(chi2),
+                "dof": int(dof),
+                "pvalue": float(p_chi2),
+                "expected": expected.astype(float),
+                "reject_alpha": bool(p_chi2 < alpha),
+            },
+            "fisher": {
+                "odds_ratio": float(odds_ratio),   
+                "pvalue": float(p_fisher),
+                "reject_alpha": bool(p_fisher < alpha),
+            },
+        },
+    }

Some files were not shown because too many files changed in this diff