fitting_example_GPR-Cern.py

import numpy as np
from scipy.optimize import curve_fit
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel
from custom_kernels import Gibbs, LinearNoiseKernel
import matplotlib.pyplot as plt

########### User Defined Options ##########
inFileName = "DATA/original_histograms/mass_mm_higgs_Background.npz"
with np.load(inFileName) as data:
    bin_edges = data['bin_edges']
    bin_centers = data['bin_centers']
    bin_values = data['bin_values']
    bin_errors = data['bin_errors']

# Whether or not to calculate the systematic error contribution by varying the length scale
calculateSystematics = True

# How much (as a fraction) to vary the length scale when doing the systematic GP fits
# i.e. to vary by 20%, set this to 0.2
systVarFrac = 0.02

# The hyper-parameter ranges to be used, if not using the outputs of the hyper-parameter optimization
Gibbs_l0_bounds = [5, 25]
Gibbs_l_slope_bounds = [10e-5, 0.05]

# How to determine the errors while doing the GPR fit and when making the resulting smooth template.
# Options are:
#   1: Use the linear error kernel to flexibly approximate Poisson errors in the fit
#   2: Use the original errors (non-flexible) from the input template, and assign these to the output
#   3: Use the original errors (non-flexible) from the input template, but calculate Poisson errors for the output
whichErrorTreatment = 2
useGPRLinearError = False
useOriginalTemplateError = False
usePoissonError = False
if(whichErrorTreatment == 1): useGPRLinearError = True
elif(whichErrorTreatment == 2): useOriginalTemplateError = True
elif(whichErrorTreatment == 3): usePoissonError = True
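# Optional guard (not in the original script): any value of whichErrorTreatment other
# than 1, 2 or 3 would silently leave all three flags False, so fail fast instead.
if whichErrorTreatment not in (1, 2, 3):
    raise ValueError("whichErrorTreatment must be 1, 2 or 3, got %r" % whichErrorTreatment)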

def mySimpleExponential(x, a, b):
    """Basic exponential for the GPR prior"""
    return a * np.exp(b * x)
###########################################

########### The Important Stuff ###########
nEvts = np.sum(bin_values)
nBins = bin_values.size
xMin = bin_edges[0]
xMax = bin_edges[-1]

# Define the GPR prior (it should be reasonably similar to the input template shape).
# The baseline (histogram/array) is built from a coarse fit to the input array; its
# final normalization should match the input.
fitfun = mySimpleExponential
binshift = bin_centers - bin_centers[0]
popt, pcov = curve_fit(fitfun, binshift, bin_values, p0=[bin_values[0], -0.05])
perr = np.sqrt(np.diag(pcov))
a, b = popt
print("a,b", a, b)
myBaseLine = np.array(fitfun(binshift, a, b))
myBaseLine = (nEvts / np.sum(myBaseLine)) * myBaseLine

""" print("mya", nEvts, bin_values[1:5])
print("myb", np.sum(myBaseLine), myBaseLine[1:5])
xerrs = 0.5*(bin_edges[1:]-bin_edges[:-1])  # or None
plt.errorbar(bin_centers, bin_values, bin_errors, xerrs, fmt="none", color='b', ecolor='b', label='Original histogram')  # fmt='.k'
plt.plot(bin_centers, fitfun(binshift, *popt), 'r-', label='fit: a=%5.3f, b=%5.3f' % tuple(popt))
plt.plot(bin_centers, myBaseLine, 'g-', label='fit baseline')
plt.legend()
plt.savefig("m_mumu_fit.pdf", format="pdf") """

# Get estimates of the different hyper-parameter bounds from the input template.
# Set bounds on the scaling of the GPR template (basically a maximum y-range).
const_low = 10.0**-5
const_hi = nEvts * 10.0
Gibbs_l0 = Gibbs_l0_bounds[0] + 0.9 * (Gibbs_l0_bounds[1] - Gibbs_l0_bounds[0])
Gibbs_l_slope = Gibbs_l_slope_bounds[0] + 0.9 * (Gibbs_l_slope_bounds[1] - Gibbs_l_slope_bounds[0])

# Set bounds on the magnitude of the GPR error bars
errConst0 = np.max(bin_errors)
errConst_low = 0.90 * errConst0
errConst_hi = 1.10 * errConst0

# Estimate the (decreasing) slope of the magnitude of the GPR error bars
slope_est0 = -1.0 * (bin_errors[0] - bin_errors[-1]) / (bin_centers[0] - bin_centers[-1])
if(slope_est0 < 10**-5): slope_est0 = 10**-4
slope_est_low = 0.5 * slope_est0
slope_est_hi = 1.05 * slope_est0

# Define the GPR kernel
if(useGPRLinearError):
    kernel = (
        ConstantKernel(constant_value=1.0, constant_value_bounds=(const_low, const_hi))
        * Gibbs(l0=Gibbs_l0, l0_bounds=(Gibbs_l0_bounds[0], Gibbs_l0_bounds[1]), l_slope=Gibbs_l_slope, l_slope_bounds=(Gibbs_l_slope_bounds[0], Gibbs_l_slope_bounds[1]))
        + ConstantKernel(constant_value=1.0, constant_value_bounds=(1.0, 1.0))
        * LinearNoiseKernel(noise_level=errConst0, noise_level_bounds=(errConst_low, errConst_hi), b=slope_est0, b_bounds=(slope_est_low, slope_est_hi))
    )
else:
    kernel = (
        ConstantKernel(constant_value=1.0, constant_value_bounds=(const_low, const_hi))
        * Gibbs(l0=Gibbs_l0, l0_bounds=(Gibbs_l0_bounds[0], Gibbs_l0_bounds[1]), l_slope=Gibbs_l_slope, l_slope_bounds=(Gibbs_l_slope_bounds[0], Gibbs_l_slope_bounds[1]))
    )
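# Optional diagnostic (not in the original script): scikit-learn kernels expose their
# hyper-parameters and their log-transformed values, which is handy for checking the
# starting point and bounds before the marginal-likelihood optimization below.
print("Initial kernel:", kernel)
print("Initial log-hyperparameters (theta):", kernel.theta)
for hyperpar in kernel.hyperparameters:
    print("  ", hyperpar.name, "bounds:", hyperpar.bounds)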

# Fit the GPR kernel to the input template
gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=3, alpha=2 * bin_errors)
gpr.fit(bin_centers.reshape(len(bin_centers), 1), (bin_values - myBaseLine).ravel())

# Check to see if the GPR prediction is effectively the same as the exponential prior.
# If so, re-do the GPR fit, but with a flat prior instead of the exponential prior.
gprPrediction, gprCov = gpr.predict(bin_centers.reshape(len(bin_centers), 1), return_cov=True)
if(np.sqrt((gprPrediction * gprPrediction).sum()) / bin_values.sum() < 0.0001):
    myBaseLine = np.zeros(len(bin_values))
    gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=3, normalize_y=True)
    gpr.fit(bin_centers.reshape(len(bin_centers), 1), (bin_values - myBaseLine).ravel())
    gprPrediction, gprCov = gpr.predict(bin_centers.reshape(len(bin_centers), 1), return_cov=True)
gprError = np.copy(np.diagonal(gprCov))
gprPrediction = gprPrediction + myBaseLine
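# Optional diagnostic (not in the original script): print the optimized kernel and its
# log-marginal likelihood, useful when comparing hyper-parameter choices between runs.
print("Optimized kernel:", gpr.kernel_)
print("Log-marginal likelihood:", gpr.log_marginal_likelihood_value_)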

# Determine the error bars based on the requested error treatment
outputErrorBars = np.zeros(len(gprPrediction), 'd')
# Use the GPR template's linear error kernel for the errors
if(useGPRLinearError): outputErrorBars = gprError.copy()
# Use the original template's errors
if(useOriginalTemplateError): outputErrorBars = bin_errors.copy()
# Calculate Poisson errors
if(usePoissonError):
    for i in range(len(gprPrediction)):
        outputErrorBars[i] = np.sqrt(gprPrediction[i])
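# Aside (not in the original script): np.sqrt(np.clip(gprPrediction, 0.0, None)) would be an
# equivalent vectorized form that also guards against small negative GPR predictions.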

# Calculate systematic errors by varying the length scale
if(calculateSystematics):
    print("\n Performing systematic variations of the length scales\n")
    # Define the systematic kernels (vary the length scale by some fraction, keeping the other parameters constant)
    theta_nom = gpr.kernel_.theta
    theta_up = []
    theta_down = []
    for ipar, hyperpar in enumerate(gpr.kernel_.hyperparameters):
        theta_up.append(theta_nom[ipar])
        theta_down.append(theta_nom[ipar])
        if(('l0' in hyperpar.name) or ('length_scale' in hyperpar.name)):
            theta_up[ipar] += np.log(1.0 + systVarFrac)
            theta_down[ipar] += np.log(1.0 - systVarFrac)
    kernel_systUp = gpr.kernel_.clone_with_theta(theta_up)
    kernel_systDown = gpr.kernel_.clone_with_theta(theta_down)
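    # Note (not in the original script): scikit-learn kernels store theta as the log of
    # the hyper-parameter values, so adding np.log(1 +/- systVarFrac) above multiplies
    # the length scale itself by a factor of (1 +/- systVarFrac).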
    # Perform the fits using the systematic kernels
    gpr_systUp = GaussianProcessRegressor(kernel=kernel_systUp, n_restarts_optimizer=3, alpha=2 * bin_errors)
    gpr_systDown = GaussianProcessRegressor(kernel=kernel_systDown, n_restarts_optimizer=3, alpha=2 * bin_errors)
    gpr_systUp.fit(bin_centers.reshape(len(bin_centers), 1), (bin_values - myBaseLine).ravel())
    gpr_systDown.fit(bin_centers.reshape(len(bin_centers), 1), (bin_values - myBaseLine).ravel())
    gprPrediction_systUp = gpr_systUp.predict(bin_centers.reshape(len(bin_centers), 1))
    gprPrediction_systUp = gprPrediction_systUp + myBaseLine
    gprPrediction_systDown = gpr_systDown.predict(bin_centers.reshape(len(bin_centers), 1))
    gprPrediction_systDown = gprPrediction_systDown + myBaseLine
    # Calculate the systematic error bars (maximum of the two variations)
    gprHiErrors = np.zeros(len(gprPrediction), 'd')
    gprLowErrors = np.zeros(len(gprPrediction), 'd')
    gprSystErrors = np.zeros(len(gprPrediction), 'd')
    for i in range(len(gprPrediction)):
        gprHiErrors[i] = np.abs(gprPrediction[i] - np.max([gprPrediction_systUp[i], gprPrediction_systDown[i]]))
        gprLowErrors[i] = np.abs(gprPrediction[i] - np.min([gprPrediction_systUp[i], gprPrediction_systDown[i]]))
        gprSystErrors[i] += np.max([gprHiErrors[i], gprLowErrors[i]])
        outputErrorBars[i] += gprSystErrors[i]
    gprHiErrors = np.array(gprHiErrors)
    gprLowErrors = np.array(gprLowErrors)
    gprSystErrors = np.array(gprSystErrors)
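    # Optional summary (not in the original script): report the largest relative shift
    # induced by the length-scale variation, as a quick scale check on the systematic.
    relSyst = gprSystErrors / np.maximum(np.abs(gprPrediction), 1e-12)
    print("Largest relative length-scale systematic: %.3g" % np.max(relSyst))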

##########################################################################################################
# PLOT
##########################################################################################################
f, (ax1, ax2) = plt.subplots(2, sharex=True, figsize=(6, 6), gridspec_kw={'height_ratios': [3, 1]})
col = 'g'
cols = 'r'
logy = False
ymax = 1.2 * 10**5
yloc = 0.95
xwidths = (bin_edges[1:] - bin_edges[:-1])
xerrs = 0.4 * xwidths
ys = bin_values
yerrs = bin_errors
ax1.errorbar(bin_centers, ys, yerrs, xerrs, fmt=".", color='k', ecolor='k')  # fmt='.k'
gys = gprPrediction
gerrs = outputErrorBars
ax1.errorbar(bin_centers, gys, gerrs, xerrs, fmt="none", color=col, ecolor=col)  # fmt='.k'
if(calculateSystematics):
    yval = gprPrediction
    yvalm = gprPrediction_systDown
    yvalp = gprPrediction_systUp
    yerr = gprSystErrors
    ax1.fill_between(bin_centers, yvalm, yvalp, color=col, alpha=0.2)
import matplotlib.lines as mlines
gprlab = mlines.Line2D([], [], color=col, markersize=10, label='Smoothed GPR')
orglab = mlines.Line2D([], [], color='k', marker=".", lw=0, markersize=10, label='Original histogram')
ax1.legend(handles=[orglab, gprlab], loc='upper right', frameon=0, bbox_to_anchor=(0.95, yloc))
ax1.set_ylabel("Events/bin")
ax1.set_xlabel(r"$m_{\mu\mu}$ [GeV]")
ax1.set_xlim([bin_edges[0], bin_edges[-1]])
ax1.set_ylim([0.01, ymax])
# prettify #1
from matplotlib import ticker
formatter = ticker.ScalarFormatter(useMathText=True)
formatter.set_scientific(True)
formatter.set_powerlimits((-1, 1))
ax1.yaxis.set_major_formatter(formatter)
ax1.tick_params(labeltop=False, labelright=False)
plt.xlabel(ax1.get_xlabel(), horizontalalignment='right', x=1.0)
plt.ylabel(ax1.get_ylabel(), horizontalalignment='right', y=1.0)
from matplotlib.ticker import AutoMinorLocator
ax1.xaxis.set_minor_locator(AutoMinorLocator())
ax1.yaxis.set_minor_locator(AutoMinorLocator())
ax1.tick_params(axis='y', direction="in", left=1, right=1, which='both')
ax1.tick_params(axis='x', direction="in", bottom=1, top=1, which='both')
ax1.tick_params(axis='both', which='major', length=12)
ax1.tick_params(axis='both', which='minor', length=6)
if logy:
    from matplotlib.ticker import NullFormatter, LogLocator
    ax1.set_yscale("log")
    locmin = LogLocator(base=10.0, subs=(0.2, 0.4, 0.6, 0.8, 1, 2, 4, 6, 8, 9, 10))
    locmin = LogLocator(base=10.0, subs=(2, 4, 6, 8, 10))
    ax1.yaxis.set_minor_locator(locmin)
    ax1.yaxis.set_minor_formatter(NullFormatter())
    finelab = ax1.yaxis.get_ticklabels()
    finelab[1].set_visible(False)

# Ratio plot
zvals = ys / gys
ry = yerrs / ys
rb = gerrs / gys
zerrs = zvals * np.sqrt(ry * ry + rb * rb)
ax2.errorbar(bin_centers, zvals, xerr=xerrs, yerr=zerrs, fmt='none', color=col, markersize=10)
ax2.set_ylabel('Org/Smooth')
# Fine y-label control to avoid overlap between the two panels
finelab = ax2.yaxis.get_ticklabels()
finelab[0].set_visible(False)
if not logy:
    finelab[-1].set_visible(False)
ax2.set_xlabel(r"$m_{\mu\mu}$ [GeV]")
ax2.set_xlim([bin_edges[0], bin_edges[-1]])
ax2.set_ylim([0.8, 1.2])
ax2.axhline(1, color='k', lw=1)
# prettify #2
ax2.tick_params(labeltop=False, labelright=False)
plt.xlabel(ax2.get_xlabel(), horizontalalignment='right', x=1.0)
plt.ylabel(ax2.get_ylabel(), horizontalalignment='center', y=0.5)
from matplotlib.ticker import AutoMinorLocator
ax2.xaxis.set_minor_locator(AutoMinorLocator())
ax2.yaxis.set_minor_locator(AutoMinorLocator())
ax2.tick_params(axis='y', direction="in", left=1, right=1, which='both')
ax2.tick_params(axis='x', direction="in", bottom=1, top=1, which='both')
ax2.tick_params(axis='both', which='major', length=12)
ax2.tick_params(axis='both', which='minor', length=6)
f.subplots_adjust(hspace=0)
plt.savefig("m_mumu_smooth.pdf", format="pdf")
plt.show()
plt.close()
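
# Optional (not part of the original script): save the smoothed template to an .npz file
# using the same keys as the input file, in case downstream code expects that layout.
# The output path below is a hypothetical choice for illustration only.
saveSmoothedTemplate = False
if saveSmoothedTemplate:
    outFileName = "DATA/smoothed_histograms/mass_mm_higgs_Background_smoothed.npz"
    np.savez(outFileName,
             bin_edges=bin_edges,
             bin_centers=bin_centers,
             bin_values=gprPrediction,
             bin_errors=outputErrorBars)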