# metadata.py (5.5 KB)
  1. """Metadata and patient-label utilities for the PET/SUV dataset."""
  2. from __future__ import annotations
  3. import pathlib
  4. import re
  5. import pandas as pd
  6. def get_meta_data(data_raw_path: str | pathlib.Path) -> pd.DataFrame:
  7. """Collect PET/SEG file metadata from a raw dataset directory.
  8. The function expects paths with the form::
  9. <patient_id>/VISIT_<visit>/<patient_id>_VISIT_<visit>_<organ>_<modality>.nii.gz
  10. where ``modality`` is either ``PET`` or ``SEG``. The returned dataframe is
  11. indexed by ``patient_id``, ``organ`` and ``visit`` and contains PET/SEG
  12. filenames and paths in a wide format.
  13. Parameters
  14. ----------
  15. data_raw_path:
  16. Root directory containing the NIfTI files.
  17. Returns
  18. -------
  19. pandas.DataFrame
  20. Metadata table with columns ``PET_filename``, ``PET_path``,
  21. ``SEG_filename`` and ``SEG_path``.
  22. """
  23. pattern = re.compile(
  24. r"(?P<patient_id>NIX-LJU-D\d+-IRAE-A\d+)/VISIT_(?P<visit>\d+)/"
  25. r"(?P=patient_id)_VISIT_(?P=visit)_(?P<organ>.+)_(?P<modality>PET|SEG)\.nii\.gz$"
  26. )
  27. rows: list[dict[str, object]] = []
  28. for item in pathlib.Path(data_raw_path).rglob("*.nii.gz"):
  29. match = pattern.search(str(item))
  30. if match:
  31. rows.append(
  32. {
  33. "filename": item.name,
  34. "path": str(item),
  35. "patient_id": match.group("patient_id"),
  36. "visit": int(match.group("visit")),
  37. "organ": match.group("organ"),
  38. "modality": match.group("modality"),
  39. }
  40. )
  41. else:
  42. print(f"Could not parse: {item}")
  43. df = pd.DataFrame(rows)
  44. if df.empty:
  45. return df
  46. index_cols = ["patient_id", "organ", "visit"]
  47. counts = df.groupby(index_cols + ["modality"]).size()
  48. duplicates = counts[counts > 1]
  49. if not duplicates.empty:
  50. raise ValueError(f"Duplicate images found:\n{duplicates}")
  51. df_wide = df.pivot(index=index_cols, columns="modality", values=["filename", "path"])
  52. # Flatten MultiIndex columns: (field, modality) -> MODALITY_field.
  53. df_wide.columns = [f"{modality}_{field}" for field, modality in df_wide.columns]
  54. expected_cols = ["PET_filename", "PET_path", "SEG_filename", "SEG_path"]
  55. for col in expected_cols:
  56. if col not in df_wide.columns:
  57. df_wide[col] = pd.NA
  58. return df_wide[expected_cols].sort_index()
  59. def flag_corrupted_files(df: pd.DataFrame) -> pd.DataFrame:
  60. """Add an ``is_corrupted`` flag for known corrupted patient-organ-visits.
  61. Parameters
  62. ----------
  63. df:
  64. Metadata dataframe indexed by ``patient_id``, ``organ`` and ``visit``.
  65. Returns
  66. -------
  67. pandas.DataFrame
  68. Copy of ``df`` with an added boolean column ``is_corrupted``.
  69. """
  70. df = df.copy()
  71. prefix = "NIX-LJU-D2002-IRAE-A"
  72. corrupted = {
  73. (13, "Lung", 1),
  74. (14, "Lung", 2),
  75. (24, "Lung", 0),
  76. (1, "Colon", 0),
  77. (16, "Colon", 0),
  78. }
  79. corrupted_ids = [
  80. (f"{prefix}{patient_id:03d}", organ, visit)
  81. for patient_id, organ, visit in corrupted
  82. ]
  83. df["is_corrupted"] = False
  84. existing_ids = [idx for idx in corrupted_ids if idx in df.index]
  85. missing_ids = [idx for idx in corrupted_ids if idx not in df.index]
  86. if missing_ids:
  87. print("Warning: these corrupted IDs were not found in the dataframe:")
  88. for idx in missing_ids:
  89. print(idx)
  90. df.loc[existing_ids, "is_corrupted"] = True
  91. return df
  92. def flag_AE_patients(df: pd.DataFrame) -> pd.DataFrame:
  93. """Add an ``is_AE_patient`` flag for known AE patient-organ pairs.
  94. The AE label is assigned at patient-organ level and therefore applies to all
  95. visits of a given patient-organ pair.
  96. Parameters
  97. ----------
  98. df:
  99. Metadata dataframe indexed by ``patient_id``, ``organ`` and ``visit``.
  100. Returns
  101. -------
  102. pandas.DataFrame
  103. Copy of ``df`` with an added boolean column ``is_AE_patient``.
  104. """
  105. df = df.copy()
  106. prefix = "NIX-LJU-D2002-IRAE-A"
  107. ae_patients = {
  108. "Thyroid": [1, 2, 14, 17, 18, 20, 21, 22],
  109. "Lung": [1, 2, 4, 20],
  110. "Colon": [5, 7, 18, 28],
  111. }
  112. ae_keys = {
  113. (f"{prefix}{patient:03d}", organ)
  114. for organ, patients in ae_patients.items()
  115. for patient in patients
  116. }
  117. index_df = df.index.to_frame(index=False)
  118. df["is_AE_patient"] = [
  119. (patient_id, organ) in ae_keys
  120. for patient_id, organ in zip(index_df["patient_id"], index_df["organ"])
  121. ]
  122. return df
  123. def get_AE_statistics(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series, int]:
  124. """Summarize AE patient-organ labels.
  125. Parameters
  126. ----------
  127. df:
  128. Metadata dataframe with a boolean column ``is_AE_patient``.
  129. Returns
  130. -------
  131. tuple
  132. ``(AE_patient_organs, AE_counts_by_organ, n_AE_patient_organ_pairs)``.
  133. """
  134. if "is_AE_patient" not in df.columns:
  135. raise ValueError("DataFrame must contain column 'is_AE_patient'.")
  136. AE_patient_organs = (
  137. df.loc[df["is_AE_patient"]]
  138. .index.to_frame(index=False)[["patient_id", "organ"]]
  139. .drop_duplicates()
  140. .sort_values(["organ", "patient_id"])
  141. .reset_index(drop=True)
  142. )
  143. AE_counts_by_organ = (
  144. AE_patient_organs.groupby("organ")
  145. .size()
  146. .rename("n_AE_patient_organ_pairs")
  147. .sort_index()
  148. )
  149. n_AE_patient_organ_pairs = len(AE_patient_organs)
  150. return AE_patient_organs, AE_counts_by_organ, n_AE_patient_organ_pairs