import numpy as np from scipy.optimize import curve_fit from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import ConstantKernel from custom_kernels import Gibbs, LinearNoiseKernel import matplotlib.pyplot as plt ########### User Defined Options ########## inFileName = "DATA/original_histograms/mass_mm_higgs_Background.npz" with np.load(inFileName) as data: bin_edges = data['bin_edges'] bin_centers = data['bin_centers'] bin_values = data['bin_values'] bin_errors = data['bin_errors'] # Whether or not to calculate systematic error contribution by varying the length scale calculateSystematics = True # How much (in fractional form) to vary the length scale when doing systematic GP fits # i.e. to vary by 20%, set as 0.2 systVarFrac = 0.02 # The hyper-parameter ranges to be used, if not using the outputs of the hyper-par optimization Gibbs_l0_bounds = [5, 25] Gibbs_l_slope_bounds = [10e-5, 0.05] # How to determine the errors while doing the GPR fit and in making the resulting smooth template. # Options are: # 1: Use the linear error kernel to flexibly approximate Poisson errors in the fit # 2: Use the original errors (non-flexible) from the input template, and assign these to the output # 3: Use the original errors (non-flexible) from the input template, but calculate Poisson errors for the output whichErrorTreatment = 2 useGPRLinearError = False useOriginalTemplateError = False usePoissonError = False if(whichErrorTreatment == 1): useGPRLinearError = True elif(whichErrorTreatment == 2): useOriginalTemplateError = True elif(whichErrorTreatment == 3): usePoissonError = True def mySimpleExponential(x, a, b): """Basic Exponential for GPR Prior""" return a * np.exp(b * x) ########################################### ########### The Important Stuff ########### nEvts = np.sum(bin_values) nBins = bin_values.size xMin = bin_edges[0] xMax = bin_edges[-1] # Define the GPR Prior (should be reasonably similar to the input template shape) # this makes the baseline (histogram/array) by a coarse fit to the input array, the final normalization should match the input fitfun = mySimpleExponential binshift = bin_centers - bin_centers[0] popt, pcov = curve_fit(fitfun, binshift, bin_values, p0=[bin_values[0], -0.05]) perr = np.sqrt(np.diag(pcov)) a, b = popt print("a,b", a, b) myBaseLine = np.array(fitfun(binshift, a, b)) myBaseLine = (nEvts / np.sum(myBaseLine)) * myBaseLine """ print("mya",nEvts,bin_values[1:5]) print("myb",np.sum(myBaseLine),myBaseLine[1:5]) xerrs = 0.5*(bin_edges[1:]-bin_edges[:-1]) # or None plt.errorbar(bin_centers, bin_values, bin_errors, xerrs, fmt="none",color='b',ecolor='b',label='Original histogram') #fmt='.k' plt.plot(bin_centers, fitfun(binshift, *popt), 'r-',label='fit: a=%5.3f, b=%5.3f' % tuple(popt)) plt.plot(bin_centers,myBaseLine,'g-', label='fit baseline') plt.legend() plt.savefig("m_mumu_fit.pdf",format="pdf") """ # Get estimates of different hyper-parameter bounds from the input template # Set bounds on the scaling of the GPR template (basically a maximum y-range) const_low = 10.0**-5 const_hi = nEvts * 10.0 Gibbs_l0 = Gibbs_l0_bounds[0] + 0.9 * (Gibbs_l0_bounds[1] - Gibbs_l0_bounds[0]) Gibbs_l_slope = Gibbs_l_slope_bounds[0] + 0.9 * (Gibbs_l_slope_bounds[1] - Gibbs_l_slope_bounds[0]) # Set bounds on the magnitude of the GPR error bars errConst0 = np.max(bin_errors) errConst_low = 0.90 * errConst0 errConst_hi = 1.10 * errConst0 # Estimate the (decreasing) slope of the magnitude of the GPR error bars slope_est0 = -1.0 * (bin_errors[0] - bin_errors[-1]) / (bin_centers[0] - bin_centers[-1]) if(slope_est0 < 10**-5): slope_est0 = 10**-4 slope_est_low = 0.5 * slope_est0 slope_est_hi = 1.05 * slope_est0 # Define the GPR Kernel if(useGPRLinearError): kernel = ( ConstantKernel(constant_value=1.0, constant_value_bounds=(const_low, const_hi)) * Gibbs(l0=Gibbs_l0, l0_bounds=(Gibbs_l0_bounds[0], Gibbs_l0_bounds[1]), l_slope=Gibbs_l_slope, l_slope_bounds=(Gibbs_l_slope_bounds[0], Gibbs_l_slope_bounds[1])) + ConstantKernel(constant_value=1.0, constant_value_bounds=(1.0, 1.0)) * LinearNoiseKernel(noise_level=errConst0, noise_level_bounds=(errConst_low, errConst_hi), b=slope_est0, b_bounds=(slope_est_low, slope_est_hi)) ) else: kernel = ( ConstantKernel(constant_value=1.0, constant_value_bounds=(const_low, const_hi)) * Gibbs(l0=Gibbs_l0, l0_bounds=(Gibbs_l0_bounds[0], Gibbs_l0_bounds[1]), l_slope=Gibbs_l_slope, l_slope_bounds=(Gibbs_l_slope_bounds[0], Gibbs_l_slope_bounds[1])) ) # Fit the GPR Kernel to the input template gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=3, alpha=bin_errors**2) gpr.fit(bin_centers.reshape(len(bin_centers), 1), (bin_values - myBaseLine).ravel()) # Check to see if GPR Prediction is effectively the same as the exponential prior # If so, re-do the GPR fitting, but with a flat prior instead of exponential prior gprPrediction, gprCov = gpr.predict(bin_centers.reshape(len(bin_centers), 1), return_cov=True) if(np.sqrt((gprPrediction * gprPrediction).sum()) / bin_values.sum() < 0.0001): myBaseLine = np.zeros(len(bin_values)) gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=3, normalize_y=True) gpr.fit(bin_centers.reshape(len(bin_centers), 1), (bin_values - myBaseLine).ravel()) gprPrediction, gprCov = gpr.predict(bin_centers.reshape(len(bin_centers), 1), return_cov=True) gprError = np.copy(np.diagonal(gprCov)) gprPrediction = gprPrediction + myBaseLine # Determine the error bars based on requested error treatment outputErrorBars = np.zeros(len(gprPrediction), 'd') # Use the GPR template's linear error kernel for errors if(useGPRLinearError): outputErrorBars = gprError.copy() # Use the original template's errors if(useOriginalTemplateError): outputErrorBars = bin_errors.copy() # Calculate Poisson errors if(usePoissonError): for i in range(len(gprPrediction)): outputErrorBars[i] = np.sqrt(gprPrediction[i]) # Calculate Systematic Errors by Varying the Length Scale if(calculateSystematics): print("\n Peforming systematics variations of the lengths scales\n") # Define the systematic kernels (vary the length scale by some fraction, keeping the other parameters constant theta_nom = gpr.kernel_.theta theta_up = [] theta_down = [] for ipar, hyperpar in enumerate(gpr.kernel_.hyperparameters): theta_up.append(theta_nom[ipar]) theta_down.append(theta_nom[ipar]) if(('l0' in hyperpar.name) or ('length_scale' in hyperpar.name)): theta_up[ipar] += np.log(1.0 + systVarFrac) theta_down[ipar] += np.log(1.0 - systVarFrac) kernel_systUp = gpr.kernel_.clone_with_theta(theta_up) kernel_systDown = gpr.kernel_.clone_with_theta(theta_down) # Perform the fits using the systematic kernels gpr_systUp = GaussianProcessRegressor(kernel=kernel_systUp, n_restarts_optimizer=3, alpha=bin_errors**2) gpr_systDown = GaussianProcessRegressor(kernel=kernel_systDown, n_restarts_optimizer=3, alpha=bin_errors**2) gpr_systUp.fit(bin_centers.reshape(len(bin_centers), 1), (bin_values - myBaseLine).ravel()) gpr_systDown.fit(bin_centers.reshape(len(bin_centers), 1), (bin_values - myBaseLine).ravel()) gprPrediction_systUp = gpr_systUp.predict(bin_centers.reshape(len(bin_centers), 1)) gprPrediction_systUp = gprPrediction_systUp + myBaseLine gprPrediction_systDown = gpr_systDown.predict(bin_centers.reshape(len(bin_centers), 1)) gprPrediction_systDown = gprPrediction_systDown + myBaseLine # Calculate the error bars (maximum of the two variations) gprHiErrors = np.zeros(len(gprPrediction), 'd') gprLowErrors = np.zeros(len(gprPrediction), 'd') gprSystErrors = np.zeros(len(gprPrediction), 'd') for i in range(len(gprPrediction)): gprHiErrors[i] = np.abs(gprPrediction[i] - np.max([gprPrediction_systUp[i], gprPrediction_systDown[i]])) gprLowErrors[i] = np.abs(gprPrediction[i] - np.min([gprPrediction_systUp[i], gprPrediction_systDown[i]])) gprSystErrors[i] += np.max([gprHiErrors[i], gprLowErrors[i]]) outputErrorBars[i] += gprSystErrors[i] gprHiErrors = np.array(gprHiErrors) gprLowErrors = np.array(gprLowErrors) gprSystErrors = np.array(gprSystErrors) ########################################################################################################## # PLOT ########################################################################################################## f, (ax1, ax2) = plt.subplots(2, sharex=True, figsize=(6, 6), gridspec_kw={'height_ratios': [3, 1]}) col = 'g' cols = 'r' logy = False ymax = 1.2 * 10**5 yloc = 0.95 xwidths = (bin_edges[1:] - bin_edges[:-1]) xerrs = 0.4 * xwidths ys = bin_values yerrs = bin_errors ax1.errorbar(bin_centers, ys, yerrs, xerrs, fmt=".", color='k', ecolor='k') # fmt='.k' gys = gprPrediction gerrs = outputErrorBars ax1.errorbar(bin_centers, gys, gerrs, xerrs, fmt="none", color=col, ecolor=col) # fmt='.k' if(calculateSystematics): yval = gprPrediction yvalm = gprPrediction_systDown yvalp = gprPrediction_systUp yerr = gprSystErrors ax1.fill_between(bin_centers, yvalm, yvalp, color=col, alpha=0.2) import matplotlib.lines as mlines gprlab = mlines.Line2D([], [], color=col, markersize=10, label='Smoothed GPR') orglab = mlines.Line2D([], [], color='k', marker=".", lw=0, markersize=10, label='Original histogram') ax1.legend(handles=[orglab, gprlab], loc='upper right', frameon=0, bbox_to_anchor=(0.95, yloc)) ax1.set_ylabel("Events/bin") ax1.set_xlabel(r"$m_{\mu\mu}$ [GeV]") ax1.set_xlim([bin_edges[0], bin_edges[-1]]) ax1.set_ylim([0.01, ymax]) # prettyfy #1 from matplotlib import ticker formatter = ticker.ScalarFormatter(useMathText=True) formatter.set_scientific(True) formatter.set_powerlimits((-1, 1)) ax1.yaxis.set_major_formatter(formatter) ax1.tick_params(labeltop=False, labelright=False) plt.xlabel(ax1.get_xlabel(), horizontalalignment='right', x=1.0) plt.ylabel(ax1.get_ylabel(), horizontalalignment='right', y=1.0) from matplotlib.ticker import AutoMinorLocator ax1.xaxis.set_minor_locator(AutoMinorLocator()) ax1.yaxis.set_minor_locator(AutoMinorLocator()) ax1.tick_params(axis='y', direction="in", left=1, right=1, which='both') ax1.tick_params(axis='x', direction="in", bottom=1, top=1, which='both') ax1.tick_params(axis='both', which='major', length=12) ax1.tick_params(axis='both', which='minor', length=6) if logy: from matplotlib.ticker import NullFormatter, LogLocator ax1.set_yscale("log") locmin = LogLocator(base=10.0, subs=(0.2, 0.4, 0.6, 0.8, 1, 2, 4, 6, 8, 9, 10)) locmin = LogLocator(base=10.0, subs=(2, 4, 6, 8, 10)) ax1.yaxis.set_minor_locator(locmin) ax1.yaxis.set_minor_formatter(NullFormatter()) finelab = ax1.yaxis.get_ticklabels() finelab[1].set_visible(False) # ratio plot zvals = ys / gys ry = yerrs / ys rb = gerrs / gys zerrs = zvals * np.sqrt(ry * ry + rb * rb) ax2.errorbar(bin_centers, zvals, xerr=xerrs, yerr=zerrs, fmt='none', color=col, markersize=10) ax2.set_ylabel('Org/Smooth') # fine y-label control for overlap finelab = ax2.yaxis.get_ticklabels() finelab[0].set_visible(False) if not logy: finelab[-1].set_visible(False) ax2.set_xlabel(r"$m_{\mu\mu}$ [GeV]") ax2.set_xlim([bin_edges[0], bin_edges[-1]]) ax2.set_ylim([0.8, 1.2]) ax2.axhline(1, color='k', lw=1) # prettyfy #2 ax2.tick_params(labeltop=False, labelright=False) plt.xlabel(ax2.get_xlabel(), horizontalalignment='right', x=1.0) plt.ylabel(ax2.get_ylabel(), horizontalalignment='center', y=0.5) from matplotlib.ticker import AutoMinorLocator ax2.xaxis.set_minor_locator(AutoMinorLocator()) ax2.yaxis.set_minor_locator(AutoMinorLocator()) ax2.tick_params(axis='y', direction="in", left=1, right=1, which='both') ax2.tick_params(axis='x', direction="in", bottom=1, top=1, which='both') ax2.tick_params(axis='both', which='major', length=12) ax2.tick_params(axis='both', which='minor', length=6) f.subplots_adjust(hspace=0) plt.savefig("m_mumu_smooth.pdf", format="pdf") plt.show() plt.close()